Exemplo n.º 1
0
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branches_path='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Build a scored splice-site table from a table of called peaks.

    Peaks whose ``type`` is ``'3prime'`` or whose ``looks like`` is ``'AG'``
    are discarded. Every remaining peak is keyed by ``chromosome:position``
    and resolved to a (5' site, 3' site) pair:

    * if its ``type`` contains ``'prime'``, the first annotated intron of its
      transcript whose 5' site lies in ``[position - 5, position + 5)`` is
      used, and ``alt splicing`` is set when the transcript also has a peak
      of type ``'AG'``;
    * otherwise the peak position is taken as the 5' site and the tallest
      ``'AG'`` peak of the same transcript as the 3' site.

    Peaks that cannot be resolved, or whose strand is neither ``'+'`` nor
    ``'-'``, are dropped. The resolved table is then handed to
    ``SP.backfill_splice_sites`` and ``SP.find_score_branches_ppy``.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Must provide the columns ``type``, ``looks like``, ``chromosome``,
        ``position``, ``height``, ``transcript``, ``strand`` and ``index``
        (the last one is dropped).
    gff3 :
        Annotation argument forwarded unchanged to the ``SP`` helpers.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    organism : optional
        Forwarded unchanged to the ``SP`` helpers.
    branches_path : str, optional
        File passed to ``SP.find_score_branches_ppy``. Defaults to the
        location that used to be hard-coded here.

    Returns
    -------
    The value returned by ``SP.find_score_branches_ppy``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, _flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Copy the filtered rows so the column assignments below act on our own
    # frame instead of a slice of the caller's data.
    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size': [], 'alt splicing': [], '5p score': [],
                   '3p score': [], 'seq5': [], 'seq3': []}
    new_index = []

    for coord in quant_df.index:
        # Several rows may share a coordinate; the tallest one represents it.
        same_coord = quant_df[quant_df.index == coord]
        peak = same_coord.sort_values('height', ascending=False).iloc[0]
        introns = ss_dict[peak['transcript']]
        ag_peaks = quant_df[(quant_df['transcript'] == peak['transcript'])
                            & (quant_df['type'] == 'AG')]

        five_site = None
        three_site = None
        alt3 = False

        if 'prime' in peak['type']:
            low = peak['position'] - 5
            high = peak['position'] + 5
            for intron in introns:
                if low <= intron[0] < high:
                    five_site, three_site = intron[0], intron[1]
                    break
            alt3 = len(ag_peaks) > 0
        elif len(ag_peaks) > 0:
            # The previous ``'AG' in series`` test looked at the index labels,
            # not the values, so this branch could never be taken.
            five_site = peak['position']
            tallest_ag = ag_peaks.sort_values('height', ascending=False).iloc[0]
            three_site = tallest_ag['position']

        if three_site is None:
            continue

        strand = peak['strand']
        if strand not in ('+', '-'):
            # Without a strand the flanking windows are undefined; skipping
            # avoids recording sequences left over from an earlier peak.
            continue

        chrom_seq = fa_dict[peak['chromosome']]
        if strand == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        else:
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(chrom_seq[three_site - 2:three_site + 6])

        scores = SP.simple_score_junction(s5, s3, pssm)

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)
        column_dict['5p score'].append(scores[0])
        column_dict['3p score'].append(scores[1])

    # Rows are selected in the same order the loop visited them, so the
    # collected lists line up positionally with the selected rows.
    new_quant_df = quant_df[quant_df.index.isin(new_index)][
        ['genome coord', 'chromosome', 'strand', 'transcript', 'position', 'type']].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)

    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branches_path, fa_dict)

    return new_quant_df
Exemplo n.º 2
0
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Combine junction and branch calls into one scored splice-site table.

    Junction rows (everything in ``junc_df`` except ``type == '3prime'`` and
    ``looks like == 'AG'``) are collapsed to one row per index value. When
    any row for a coordinate has a positive ``annotated intron size`` the row
    with the largest annotated size supplies size and scores; otherwise the
    row with the largest ``junction size`` is used and scored with
    ``SP.simple_score_junction``. Sizes taken from ``junc_df`` are divided by
    1000.

    Coordinates that appear only in ``branch_df`` are added afterwards. If a
    branch row has no 3' splice site, the 100 bases downstream of the branch
    site are searched for TAG, CAG, GAG, AAG (in that priority order) and the
    position just after the first trinucleotide found is used. Branch rows
    for which no 3' site can be located are skipped.

    The combined table is passed through ``backfill_splice_sites`` and the
    ``seq5``/``seq3`` strings are expanded into one column per base.

    Parameters
    ----------
    junc_df : pandas.DataFrame
        Junction table; its index is compared against ``genome coord``
        values, and it must carry the columns read below.
    branch_df : pandas.DataFrame
        Branch table with ``genome coord``, ``depth``, ``transcript``,
        ``chromosome``, ``strand``, ``type``, ``intron size``, ``5p seq``,
        ``5p splice site``, ``3p splice site`` and ``branch site``.
    gff3, organism :
        Forwarded unchanged to the ``SP`` helpers and ``backfill_splice_sites``.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.

    Returns
    -------
    (new_quant_df, lariat_df) : tuple of pandas.DataFrame
        The per-base quantitation table, and the ``'3prime'``/``'AG'`` rows
        of ``junc_df`` reduced to one row per ``genome coord``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)

    quant_df = junc_df[(junc_df['type'] != '3prime') & (junc_df['looks like'] != 'AG')]

    new_intron_size = []
    alt_splice = []
    score_3 = []
    score_5 = []
    seq5 = []
    seq3 = []

    new_quant_df = pd.DataFrame(index=set(quant_df.index),
                                columns=['intron size', 'alt splicing'])
    for coord in new_quant_df.index:
        coord_df = quant_df[quant_df.index == coord]

        # More than one junction from the same peak counts as alt splicing.
        alt_splice.append(len(coord_df) > 1)

        if max(coord_df['annotated intron size']) > 0:
            top = coord_df.sort_values('annotated intron size', ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size', ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            scores = SP.simple_score_junction(top['junction sequence1'],
                                              top['junction sequence2'], pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])
        seq5.append(top['junction sequence1'])
        seq3.append(top['junction sequence2'])

    new_quant_df['intron size'] = new_intron_size
    new_quant_df['alt splicing'] = alt_splice
    new_quant_df['5p score'] = score_5
    new_quant_df['3p score'] = score_3
    new_quant_df['seq5'] = seq5
    new_quant_df['seq3'] = seq3

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = new_quant_df.merge(
        quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']],
        right_index=True, left_index=True)

    # Rows for branch-only coordinates are collected and concatenated once;
    # the coordinates come from a set, so none of them is checked twice.
    branch_rows = []
    for coord in set(branch_df['genome coord']):
        if coord in new_quant_df.index:
            continue

        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort makes "best" the row with the lowest
        # depth - confirm this is intended.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        strand = best['strand']
        chrom_seq = fa_dict[best['chromosome']]

        coord_dict = {'transcript': best['transcript'][:-2],
                      'chromosome': best['chromosome'],
                      'position': best['5p splice site'],
                      'strand': strand,
                      'type': best['type'],
                      'intron size': best['intron size'],
                      'alt splicing': len(coord_df) > 1,
                      '5p score': np.nan,
                      '3p score': np.nan,
                      'seq5': '', 'seq3': ''}

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        else:
            five_site = int(best['5p splice site'])
            if strand == '+':
                coord_dict['seq5'] = chrom_seq[(five_site - 1):(five_site + 7)]
            elif strand == '-':
                coord_dict['seq5'] = SP.reverse_complement(
                    chrom_seq[(five_site - 6):(five_site + 2)])

        if pd.notnull(best['3p splice site']):
            three_site = best['3p splice site']
        else:
            # Slice bounds must be integers; the column may hold floats
            # because it sits next to NaN-able columns.
            branch_site = int(best['branch site'])
            if strand == '+':
                after_branch = chrom_seq[branch_site:branch_site + 100]
            elif strand == '-':
                # Clamp at 0 so a branch close to the chromosome start does
                # not turn into a negative (wrap-around) slice start.
                after_branch = SP.reverse_complement(
                    chrom_seq[max(branch_site - 100, 0):branch_site])
            else:
                continue

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                hit = after_branch.find(subs)
                if hit >= 0:
                    offset = hit + 3
                    break
            if offset is None:
                # No candidate 3' site: previously this reused the offset of
                # an earlier row (or raised NameError on the first one).
                continue

            if strand == '+':
                three_site = branch_site + offset
            else:
                three_site = branch_site - offset
            coord_dict['intron size'] = abs(coord_dict['position'] - three_site)

        three_int = int(three_site)
        if strand == '+':
            coord_dict['seq3'] = chrom_seq[three_int - 5:three_int + 3]
        elif strand == '-':
            coord_dict['seq3'] = SP.reverse_complement(
                chrom_seq[three_int - 2:three_int + 6])

        coord_dict['5p score'], coord_dict['3p score'] = SP.simple_score_junction(
            coord_dict['seq5'], coord_dict['seq3'], pssm)
        branch_rows.append(pd.Series(coord_dict, name=coord))

    if branch_rows:
        new_quant_df = pd.concat([new_quant_df, pd.DataFrame(branch_rows)])

    new_quant_df = backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                         organism=organism)

    # Expand each flanking sequence into one column per base; the length of
    # the first row's sequence decides how many columns are created.
    for n in range(len(new_quant_df['seq5'].iloc[0])):
        new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
    for n in range(len(new_quant_df['seq3'].iloc[0])):
        new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime') | (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(['genome coord', 'annotated intron size'],
                                      ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[['transcript', 'chromosome', 'position', 'strand', 'type']]

    return new_quant_df, lariat_df
Exemplo n.º 3
0
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Return one row per annotated intron for every transcript in ``df``.

    For each transcript named in ``df['transcript']`` all annotated introns
    are listed in transcript order (ascending 5' coordinate on ``'+'``,
    descending on ``'-'``). An intron whose 5' site lies in
    ``[peak - 5, peak + 5)`` of one of the transcript's ``position`` values
    inherits that row's scores, sequences, ``type`` and ``alt splicing`` and
    is flagged ``peak = True``. Every other intron is filled in from the
    annotation: its flanking sequences are cut from ``fa_dict``, scored with
    ``SP.simple_score_junction`` and flagged ``peak = False``.

    Transcript and exon sizes are stored divided by 1000; ``intron size`` is
    stored as the raw coordinate difference.

    Transcripts whose strand is neither ``'+'`` nor ``'-'`` are skipped,
    because none of the strand-dependent columns can be filled for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by ``chromosome:position`` strings, with the columns
        ``transcript``, ``strand``, ``chromosome``, ``position``,
        ``3p score``, ``5p score``, ``alt splicing``, ``type``, ``seq5``
        and ``seq3``.
    gff3, organism :
        Forwarded unchanged to the ``SP`` helpers. ``organism == 'pombe'``
        selects the ``'.1'`` isoform suffix, anything else ``'T0'``.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    PSSM :
        Scoring matrix handed to ``SP.simple_score_junction``.

    Returns
    -------
    pandas.DataFrame
        New table indexed by ``chromosome:position``.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, _flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {'position': [], 'transcript': [], 'alt splicing': [], 'type': [],
                   'strand': [], 'introns in transcript': [], 'intron size': [],
                   'chromosome': [], '5p score': [], '3p score': [],
                   'intron position': [], 'exon size (us)': [], 'exon size (ds)': [],
                   'transcript size': [], 'peak': [], 'seq5': [], 'seq3': []}
    new_index = []

    if organism == 'pombe':
        iso_suffix = '.1'
    else:
        iso_suffix = 'T0'

    for tx in set(df['transcript']):
        # Filter once per transcript instead of once per lookup.
        tx_rows = df[df['transcript'] == tx]
        strand = tx_rows.iloc[0]['strand']
        chrom = tx_rows.iloc[0]['chromosome']
        if strand not in ('+', '-'):
            # Every size/sequence column below depends on the strand; filling
            # only some of them would leave the column lists misaligned.
            continue

        sign = 1 if strand == '+' else -1
        splice_sites = sorted(ss_dict[tx], key=lambda x: x[0], reverse=(strand == '-'))
        n_introns = len(splice_sites)
        iso = tx + iso_suffix

        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check whether this intron is already represented by a peak.
            in_df = False
            df_pos = None
            for peak in tx_rows['position']:
                if int(peak) - 5 <= five_site < int(peak) + 5:
                    in_df = True
                    df_pos = peak
                    break

            tx_start, tx_end = tx_dict[iso][0], tx_dict[iso][1]

            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(n_introns)
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_end - tx_start) / 1000.)

            # Distance from the transcript boundary to this intron, used for
            # the outermost exons.
            if strand == '+':
                first_exon = (five_site - tx_start) / 1000.
                last_exon = (tx_end - three_site) / 1000.
            else:
                first_exon = (tx_end - five_site) / 1000.
                last_exon = (three_site - tx_start) / 1000.

            if n == 0:
                intron_position = 'First'
                us_length = first_exon
                if n_introns > 1:
                    ds_length = sign * (splice_sites[n + 1][0] - three_site) / 1000.
                    if ds_length < 0:
                        # The next 5' site lies before this 3' site, so try
                        # the intron after it; give up with NaN if none.
                        try:
                            ds_length = sign * (splice_sites[n + 2][0] - three_site) / 1000.
                        except IndexError:
                            ds_length = np.nan
                else:
                    ds_length = last_exon
            elif n == n_introns - 1:
                intron_position = 'Last'
                us_length = (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.
                ds_length = last_exon
            else:
                intron_position = 'Middle'
                us_length = (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.
                ds_length = abs(three_site - splice_sites[n + 1][0]) / 1000.

            column_dict['intron position'].append(intron_position)
            column_dict['exon size (us)'].append(us_length)
            column_dict['exon size (ds)'].append(ds_length)

            if in_df:
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                for column in ('3p score', '5p score', 'alt splicing', 'type',
                               'seq5', 'seq3'):
                    column_dict[column].append(df.loc[peak_index, column])
                column_dict['peak'].append(True)
            else:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Position, index and flanking sequences from the annotation.
                chrom_seq = fa_dict[chrom]
                if strand == '+':
                    position = five_site + 1
                    sequence1 = chrom_seq[(five_site - 1):(five_site + 7)]
                    sequence2 = chrom_seq[(three_site - 5):(three_site + 3)]
                else:
                    position = five_site - 1
                    sequence1 = SP.reverse_complement(
                        chrom_seq[(five_site - 6):(five_site + 2)])
                    sequence2 = SP.reverse_complement(
                        chrom_seq[(three_site - 2):(three_site + 6)])

                column_dict['position'].append(position)
                new_index.append(chrom + ':' + str(position))
                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                score_5, score_3 = SP.simple_score_junction(sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create the new dataframe from the column dictionary.
    new_df = pd.DataFrame(columns=list(column_dict), index=new_index)
    for column, data in column_dict.items():
        new_df[column] = data

    return new_df
Exemplo n.º 4
0
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branches_path='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Build a scored splice-site table from a table of called peaks.

    Peaks whose ``type`` is ``'3prime'`` or whose ``looks like`` is ``'AG'``
    are discarded. Every remaining peak is keyed by ``chromosome:position``
    and resolved to a (5' site, 3' site) pair:

    * if its ``type`` contains ``'prime'``, the first annotated intron of its
      transcript whose 5' site lies in ``[position - 5, position + 5)`` is
      used, and ``alt splicing`` is set when the transcript also has a peak
      of type ``'AG'``;
    * otherwise the peak position is taken as the 5' site and the tallest
      ``'AG'`` peak of the same transcript as the 3' site.

    Peaks that cannot be resolved, or whose strand is neither ``'+'`` nor
    ``'-'``, are dropped. The resolved table is then handed to
    ``SP.backfill_splice_sites`` and ``SP.find_score_branches_ppy``.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Must provide the columns ``type``, ``looks like``, ``chromosome``,
        ``position``, ``height``, ``transcript``, ``strand`` and ``index``
        (the last one is dropped).
    gff3 :
        Annotation argument forwarded unchanged to the ``SP`` helpers.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    organism : optional
        Forwarded unchanged to the ``SP`` helpers.
    branches_path : str, optional
        File passed to ``SP.find_score_branches_ppy``. Defaults to the
        location that used to be hard-coded here.

    Returns
    -------
    The value returned by ``SP.find_score_branches_ppy``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, _flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Copy the filtered rows so the column assignments below act on our own
    # frame instead of a slice of the caller's data.
    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for coord in quant_df.index:
        # Several rows may share a coordinate; the tallest one represents it.
        same_coord = quant_df[quant_df.index == coord]
        peak = same_coord.sort_values('height', ascending=False).iloc[0]
        introns = ss_dict[peak['transcript']]
        ag_peaks = quant_df[(quant_df['transcript'] == peak['transcript'])
                            & (quant_df['type'] == 'AG')]

        five_site = None
        three_site = None
        alt3 = False

        if 'prime' in peak['type']:
            low = peak['position'] - 5
            high = peak['position'] + 5
            for intron in introns:
                if low <= intron[0] < high:
                    five_site, three_site = intron[0], intron[1]
                    break
            alt3 = len(ag_peaks) > 0
        elif len(ag_peaks) > 0:
            # The previous ``'AG' in series`` test looked at the index labels,
            # not the values, so this branch could never be taken.
            five_site = peak['position']
            tallest_ag = ag_peaks.sort_values('height',
                                              ascending=False).iloc[0]
            three_site = tallest_ag['position']

        if three_site is None:
            continue

        strand = peak['strand']
        if strand not in ('+', '-'):
            # Without a strand the flanking windows are undefined; skipping
            # avoids recording sequences left over from an earlier peak.
            continue

        chrom_seq = fa_dict[peak['chromosome']]
        if strand == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        else:
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(
                chrom_seq[three_site - 2:three_site + 6])

        scores = SP.simple_score_junction(s5, s3, pssm)

        new_index.append(coord)
        column_dict['intron size'].append(
            abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)
        column_dict['5p score'].append(scores[0])
        column_dict['3p score'].append(scores[1])

    # Rows are selected in the same order the loop visited them, so the
    # collected lists line up positionally with the selected rows.
    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df,
                                            gff3,
                                            fa_dict,
                                            pssm,
                                            organism=organism)

    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branches_path,
                                              fa_dict)

    return new_quant_df
Exemplo n.º 5
0
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Combine junction and branch calls into one scored splice-site table.

    Junction rows (everything in ``junc_df`` except ``type == '3prime'`` and
    ``looks like == 'AG'``) are collapsed to one row per index value. When
    any row for a coordinate has a positive ``annotated intron size`` the row
    with the largest annotated size supplies size and scores; otherwise the
    row with the largest ``junction size`` is used and scored with
    ``SP.simple_score_junction``. Sizes taken from ``junc_df`` are divided by
    1000.

    Coordinates that appear only in ``branch_df`` are added afterwards. If a
    branch row has no 3' splice site, the 100 bases downstream of the branch
    site are searched for TAG, CAG, GAG, AAG (in that priority order) and the
    position just after the first trinucleotide found is used. Branch rows
    for which no 3' site can be located are skipped.

    The combined table is passed through ``backfill_splice_sites`` and the
    ``seq5``/``seq3`` strings are expanded into one column per base.

    Parameters
    ----------
    junc_df : pandas.DataFrame
        Junction table; its index is compared against ``genome coord``
        values, and it must carry the columns read below.
    branch_df : pandas.DataFrame
        Branch table with ``genome coord``, ``depth``, ``transcript``,
        ``chromosome``, ``strand``, ``type``, ``intron size``, ``5p seq``,
        ``5p splice site``, ``3p splice site`` and ``branch site``.
    gff3, organism :
        Forwarded unchanged to the ``SP`` helpers and ``backfill_splice_sites``.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.

    Returns
    -------
    (new_quant_df, lariat_df) : tuple of pandas.DataFrame
        The per-base quantitation table, and the ``'3prime'``/``'AG'`` rows
        of ``junc_df`` reduced to one row per ``genome coord``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)

    quant_df = junc_df[(junc_df['type'] != '3prime')
                       & (junc_df['looks like'] != 'AG')]

    new_intron_size = []
    alt_splice = []
    score_3 = []
    score_5 = []
    seq5 = []
    seq3 = []

    new_quant_df = pd.DataFrame(index=set(quant_df.index),
                                columns=['intron size', 'alt splicing'])
    for coord in new_quant_df.index:
        coord_df = quant_df[quant_df.index == coord]

        # More than one junction from the same peak counts as alt splicing.
        alt_splice.append(len(coord_df) > 1)

        if max(coord_df['annotated intron size']) > 0:
            top = coord_df.sort_values('annotated intron size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            scores = SP.simple_score_junction(top['junction sequence1'],
                                              top['junction sequence2'],
                                              pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])
        seq5.append(top['junction sequence1'])
        seq3.append(top['junction sequence2'])

    new_quant_df['intron size'] = new_intron_size
    new_quant_df['alt splicing'] = alt_splice
    new_quant_df['5p score'] = score_5
    new_quant_df['3p score'] = score_3
    new_quant_df['seq5'] = seq5
    new_quant_df['seq3'] = seq3

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = new_quant_df.merge(
        quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']],
        right_index=True,
        left_index=True)

    # Rows for branch-only coordinates are collected and concatenated once;
    # the coordinates come from a set, so none of them is checked twice.
    branch_rows = []
    for coord in set(branch_df['genome coord']):
        if coord in new_quant_df.index:
            continue

        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort makes "best" the row with the lowest
        # depth - confirm this is intended.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        strand = best['strand']
        chrom_seq = fa_dict[best['chromosome']]

        coord_dict = {
            'transcript': best['transcript'][:-2],
            'chromosome': best['chromosome'],
            'position': best['5p splice site'],
            'strand': strand,
            'type': best['type'],
            'intron size': best['intron size'],
            'alt splicing': len(coord_df) > 1,
            '5p score': np.nan,
            '3p score': np.nan,
            'seq5': '',
            'seq3': ''
        }

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        else:
            five_site = int(best['5p splice site'])
            if strand == '+':
                coord_dict['seq5'] = chrom_seq[(five_site - 1):(five_site + 7)]
            elif strand == '-':
                coord_dict['seq5'] = SP.reverse_complement(
                    chrom_seq[(five_site - 6):(five_site + 2)])

        if pd.notnull(best['3p splice site']):
            three_site = best['3p splice site']
        else:
            # Slice bounds must be integers; the column may hold floats
            # because it sits next to NaN-able columns.
            branch_site = int(best['branch site'])
            if strand == '+':
                after_branch = chrom_seq[branch_site:branch_site + 100]
            elif strand == '-':
                # Clamp at 0 so a branch close to the chromosome start does
                # not turn into a negative (wrap-around) slice start.
                after_branch = SP.reverse_complement(
                    chrom_seq[max(branch_site - 100, 0):branch_site])
            else:
                continue

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                hit = after_branch.find(subs)
                if hit >= 0:
                    offset = hit + 3
                    break
            if offset is None:
                # No candidate 3' site: previously this reused the offset of
                # an earlier row (or raised NameError on the first one).
                continue

            if strand == '+':
                three_site = branch_site + offset
            else:
                three_site = branch_site - offset
            coord_dict['intron size'] = abs(coord_dict['position'] -
                                            three_site)

        three_int = int(three_site)
        if strand == '+':
            coord_dict['seq3'] = chrom_seq[three_int - 5:three_int + 3]
        elif strand == '-':
            coord_dict['seq3'] = SP.reverse_complement(
                chrom_seq[three_int - 2:three_int + 6])

        coord_dict['5p score'], coord_dict[
            '3p score'] = SP.simple_score_junction(coord_dict['seq5'],
                                                   coord_dict['seq3'], pssm)
        branch_rows.append(pd.Series(coord_dict, name=coord))

    if branch_rows:
        new_quant_df = pd.concat([new_quant_df, pd.DataFrame(branch_rows)])

    new_quant_df = backfill_splice_sites(new_quant_df,
                                         gff3,
                                         fa_dict,
                                         pssm,
                                         organism=organism)

    # Expand each flanking sequence into one column per base; the length of
    # the first row's sequence decides how many columns are created.
    for n in range(len(new_quant_df['seq5'].iloc[0])):
        new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
    for n in range(len(new_quant_df['seq3'].iloc[0])):
        new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime') |
                        (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(
        ['genome coord', 'annotated intron size'], ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[[
        'transcript', 'chromosome', 'position', 'strand', 'type'
    ]]

    return new_quant_df, lariat_df
Exemplo n.º 6
0
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Return one row per annotated intron for every transcript in ``df``.

    For each transcript named in ``df['transcript']`` all annotated introns
    are listed in transcript order (ascending 5' coordinate on ``'+'``,
    descending on ``'-'``). An intron whose 5' site lies in
    ``[peak - 5, peak + 5)`` of one of the transcript's ``position`` values
    inherits that row's scores, sequences, ``type`` and ``alt splicing`` and
    is flagged ``peak = True``. Every other intron is filled in from the
    annotation: its flanking sequences are cut from ``fa_dict``, scored with
    ``SP.simple_score_junction`` and flagged ``peak = False``.

    Transcript and exon sizes are stored divided by 1000; ``intron size`` is
    stored as the raw coordinate difference.

    Transcripts whose strand is neither ``'+'`` nor ``'-'`` are skipped,
    because none of the strand-dependent columns can be filled for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by ``chromosome:position`` strings, with the columns
        ``transcript``, ``strand``, ``chromosome``, ``position``,
        ``3p score``, ``5p score``, ``alt splicing``, ``type``, ``seq5``
        and ``seq3``.
    gff3, organism :
        Forwarded unchanged to the ``SP`` helpers. ``organism == 'pombe'``
        selects the ``'.1'`` isoform suffix, anything else ``'T0'``.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    PSSM :
        Scoring matrix handed to ``SP.simple_score_junction``.

    Returns
    -------
    pandas.DataFrame
        New table indexed by ``chromosome:position``.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, _flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {
        'position': [],
        'transcript': [],
        'alt splicing': [],
        'type': [],
        'strand': [],
        'introns in transcript': [],
        'intron size': [],
        'chromosome': [],
        '5p score': [],
        '3p score': [],
        'intron position': [],
        'exon size (us)': [],
        'exon size (ds)': [],
        'transcript size': [],
        'peak': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    if organism == 'pombe':
        iso_suffix = '.1'
    else:
        iso_suffix = 'T0'

    for tx in set(df['transcript']):
        # Filter once per transcript instead of once per lookup.
        tx_rows = df[df['transcript'] == tx]
        strand = tx_rows.iloc[0]['strand']
        chrom = tx_rows.iloc[0]['chromosome']
        if strand not in ('+', '-'):
            # Every size/sequence column below depends on the strand; filling
            # only some of them would leave the column lists misaligned.
            continue

        sign = 1 if strand == '+' else -1
        splice_sites = sorted(ss_dict[tx],
                              key=lambda x: x[0],
                              reverse=(strand == '-'))
        n_introns = len(splice_sites)
        iso = tx + iso_suffix

        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check whether this intron is already represented by a peak.
            in_df = False
            df_pos = None
            for peak in tx_rows['position']:
                if int(peak) - 5 <= five_site < int(peak) + 5:
                    in_df = True
                    df_pos = peak
                    break

            tx_start, tx_end = tx_dict[iso][0], tx_dict[iso][1]

            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(n_introns)
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append(
                (tx_end - tx_start) / 1000.)

            # Distance from the transcript boundary to this intron, used for
            # the outermost exons.
            if strand == '+':
                first_exon = (five_site - tx_start) / 1000.
                last_exon = (tx_end - three_site) / 1000.
            else:
                first_exon = (tx_end - five_site) / 1000.
                last_exon = (three_site - tx_start) / 1000.

            if n == 0:
                intron_position = 'First'
                us_length = first_exon
                if n_introns > 1:
                    ds_length = sign * (splice_sites[n + 1][0] -
                                        three_site) / 1000.
                    if ds_length < 0:
                        # The next 5' site lies before this 3' site, so try
                        # the intron after it; give up with NaN if none.
                        try:
                            ds_length = sign * (splice_sites[n + 2][0] -
                                                three_site) / 1000.
                        except IndexError:
                            ds_length = np.nan
                else:
                    ds_length = last_exon
            elif n == n_introns - 1:
                intron_position = 'Last'
                us_length = (abs(five_site - splice_sites[n - 1][1]) -
                             1) / 1000.
                ds_length = last_exon
            else:
                intron_position = 'Middle'
                us_length = (abs(five_site - splice_sites[n - 1][1]) -
                             1) / 1000.
                ds_length = abs(three_site - splice_sites[n + 1][0]) / 1000.

            column_dict['intron position'].append(intron_position)
            column_dict['exon size (us)'].append(us_length)
            column_dict['exon size (ds)'].append(ds_length)

            if in_df:
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                for column in ('3p score', '5p score', 'alt splicing', 'type',
                               'seq5', 'seq3'):
                    column_dict[column].append(df.loc[peak_index, column])
                column_dict['peak'].append(True)
            else:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Position, index and flanking sequences from the annotation.
                chrom_seq = fa_dict[chrom]
                if strand == '+':
                    position = five_site + 1
                    sequence1 = chrom_seq[(five_site - 1):(five_site + 7)]
                    sequence2 = chrom_seq[(three_site - 5):(three_site + 3)]
                else:
                    position = five_site - 1
                    sequence1 = SP.reverse_complement(
                        chrom_seq[(five_site - 6):(five_site + 2)])
                    sequence2 = SP.reverse_complement(
                        chrom_seq[(three_site - 2):(three_site + 6)])

                column_dict['position'].append(position)
                new_index.append(chrom + ':' + str(position))
                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                score_5, score_3 = SP.simple_score_junction(
                    sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create the new dataframe from the column dictionary.
    new_df = pd.DataFrame(columns=list(column_dict), index=new_index)
    for column, data in column_dict.items():
        new_df[column] = data

    return new_df