def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Pair each 5' peak with a 3' splice site, score it, and backfill introns.

    Rows of ``peak_df`` whose ``type`` is ``'3prime'`` or whose
    ``looks like`` is ``'AG'`` are discarded.  The remaining peaks are keyed
    by ``'<chromosome>:<position>'`` and, for every key, the row with the
    largest ``height`` is used:

    * if its ``type`` contains ``'prime'``, the first annotated intron of the
      peak's transcript whose 5' coordinate lies in
      ``[position - 5, position + 5)`` supplies the 5'/3' sites;
    * otherwise the peak position is taken as the 5' site and the tallest
      ``type == 'AG'`` peak of the same transcript as the 3' site.

    Peaks for which no 3' site is found are dropped.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Must provide the columns read below: ``type``, ``looks like``,
        ``chromosome``, ``position``, ``index``, ``transcript``, ``height``
        and ``strand``.
    gff3
        Passed unchanged to the ``SP`` annotation helpers.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    organism : optional
        Passed unchanged to the ``SP`` helpers.
    branch_file : str, optional
        Path handed to ``SP.find_score_branches_ppy``.  Defaults to the path
        that used to be hard-coded in this function.

    Returns
    -------
    The object returned by ``SP.find_score_branches_ppy``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Work on an explicit copy: a column is added below, and assigning into
    # a filtered slice triggers pandas' chained-assignment warning.
    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size': [], 'alt splicing': [], '5p score': [],
                   '3p score': [], 'seq5': [], 'seq3': []}
    new_index = []

    # The index may hold the same coordinate more than once; every
    # occurrence is processed so that the collected columns line up with the
    # rows selected by ``isin(new_index)`` further down.
    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        if len(coord_df) == 0:
            continue

        # Tallest peak at this coordinate (positional access; the index
        # holds coordinate strings, so .ix[0] was positional as well).
        peak = coord_df.sort_values('height', ascending=False).iloc[0]
        introns = ss_dict[peak['transcript']]
        ag_peaks = quant_df[(quant_df['transcript'] == peak['transcript'])
                            & (quant_df['type'] == 'AG')]

        three_site = None
        alt3 = False
        if 'prime' in peak['type']:
            peak_range = range(peak['position'] - 5, peak['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            alt3 = len(ag_peaks) > 0
        elif len(ag_peaks) > 0:
            # The original test was ``'AG' in <Series>``, which looks at the
            # index labels instead of the values and was therefore never
            # true; test the values explicitly.
            five_site = peak['position']
            three_site = ag_peaks.sort_values(
                'height', ascending=False).iloc[0]['position']

        if three_site is None:
            continue

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)

        chrom_seq = fa_dict[peak['chromosome']]
        if peak['strand'] == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        elif peak['strand'] == '-':
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(chrom_seq[three_site - 2:three_site + 6])
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)

        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][
        ['genome coord', 'chromosome', 'strand', 'transcript', 'position',
         'type']].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)
    # seq5/seq3 are kept as whole strings here; they are not expanded into
    # per-base columns before branch scoring.
    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file,
                                              fa_dict)
    return new_quant_df
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Combine junction and branch evidence into one splice-site table.

    Steps, in order:

    1. Junction rows whose ``type`` is ``'3prime'`` or whose ``looks like``
       is ``'AG'`` are set aside; the rest are grouped by index label.  For
       each label the intron size (in units of 1000), the two junction
       sequences and the 5'/3' scores are taken from the row with the
       largest ``annotated intron size`` when that maximum is positive,
       otherwise from the row with the largest ``junction size`` (scored
       with ``SP.simple_score_junction``).
    2. Coordinates present only in ``branch_df`` are added, taking the first
       row after an ascending sort on ``depth``.  When the 3' splice site is
       missing it is placed three bases past the first ``TAG``/``CAG``/
       ``GAG``/``AAG`` hit (tried in that order) within 100 bases after the
       branch site.
    3. The table is passed through ``backfill_splice_sites`` and the
       ``seq5``/``seq3`` strings are expanded into one column per base.

    Parameters
    ----------
    junc_df : pandas.DataFrame
        Junction table; must carry a ``genome coord`` column in addition to
        the columns read below.
    branch_df : pandas.DataFrame
        Branch table with ``genome coord``, ``depth``, ``transcript``,
        ``chromosome``, ``5p splice site``, ``3p splice site``,
        ``branch site``, ``strand``, ``type``, ``intron size``, ``5p seq``.
    gff3, organism
        Passed unchanged to the ``SP`` helpers / ``backfill_splice_sites``.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.

    Returns
    -------
    (new_quant_df, lariat_df)
        ``lariat_df`` holds the set-aside junction rows, one per
        ``genome coord``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    quant_df = junc_df[(junc_df['type'] != '3prime')
                       & (junc_df['looks like'] != 'AG')]

    # Unique labels in order of first appearance (a set has no defined
    # order and is not a valid index for newer pandas).
    coords = list(quant_df.index.unique())
    new_intron_size, alt_splice = [], []
    score_5, score_3 = [], []
    seq5, seq3 = [], []
    for coord in coords:
        coord_df = quant_df[quant_df.index == coord]
        # More than one junction from the same peak => alternative splicing
        alt_splice.append(len(coord_df) > 1)

        if coord_df['annotated intron size'].max() > 0:
            top = coord_df.sort_values('annotated intron size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            scores = SP.simple_score_junction(top['junction sequence1'],
                                              top['junction sequence2'], pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])

    new_quant_df = pd.DataFrame(
        {'intron size': new_intron_size, 'alt splicing': alt_splice,
         '5p score': score_5, '3p score': score_3,
         'seq5': seq5, 'seq3': seq3},
        index=coords,
        columns=['intron size', 'alt splicing', '5p score', '3p score',
                 'seq5', 'seq3'])

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    new_quant_df = new_quant_df.merge(
        quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']],
        right_index=True, left_index=True)

    # Rows for coordinates seen only in the branch table are collected and
    # concatenated once instead of appending to the frame row by row.
    extra_rows = []
    extra_index = []
    for coord in set(branch_df['genome coord']):
        if coord in new_quant_df.index:
            continue
        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort, so the *lowest* depth row is used;
        # confirm that this is intended.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        chrom_seq = fa_dict[best['chromosome']]

        coord_dict = {'transcript': best['transcript'][:-2],
                      'chromosome': best['chromosome'],
                      'position': best['5p splice site'],
                      'strand': best['strand'],
                      'type': best['type'],
                      'intron size': best['intron size'],
                      'alt splicing': len(coord_df) > 1,
                      '5p score': np.nan,
                      '3p score': np.nan,
                      'seq5': '',
                      'seq3': ''}

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        else:
            five = int(best['5p splice site'])
            if best['strand'] == '+':
                coord_dict['seq5'] = chrom_seq[(five - 1):(five + 7)]
            elif best['strand'] == '-':
                coord_dict['seq5'] = SP.reverse_complement(
                    chrom_seq[(five - 6):(five + 2)])

        if str(best['3p splice site']) != 'nan':
            three_site = best['3p splice site']
        else:
            branch = best['branch site']
            if best['strand'] == '+':
                after_branch = chrom_seq[branch:branch + 100]
            elif best['strand'] == '-':
                after_branch = SP.reverse_complement(
                    chrom_seq[branch - 100:branch])

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                if subs in after_branch:
                    offset = after_branch.find(subs) + 3
                    break
            if offset is None:
                # No candidate 3' site in the window.  Previously this
                # raised NameError or silently reused the offset of an
                # earlier branch; skip the coordinate instead.
                continue

            three_site = branch + offset
            if best['strand'] == '-':
                three_site = branch - offset
            coord_dict['intron size'] = abs(coord_dict['position'] - three_site)

        if best['strand'] == '+':
            coord_dict['seq3'] = chrom_seq[int(three_site - 5):int(three_site) + 3]
        elif best['strand'] == '-':
            coord_dict['seq3'] = SP.reverse_complement(
                chrom_seq[int(three_site) - 2:int(three_site) + 6])

        coord_dict['5p score'], coord_dict['3p score'] = \
            SP.simple_score_junction(coord_dict['seq5'], coord_dict['seq3'],
                                     pssm)
        extra_rows.append(coord_dict)
        extra_index.append(coord)

    if extra_rows:
        extra_df = pd.DataFrame(extra_rows, index=extra_index)
        extra_df = extra_df.reindex(columns=new_quant_df.columns)
        new_quant_df = pd.concat([new_quant_df, extra_df])

    new_quant_df = backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                         organism=organism)

    # One column per base; the width is taken from the first row, so all
    # sequences are assumed to be at least that long.
    if len(new_quant_df) > 0:
        for n in range(len(new_quant_df['seq5'].iloc[0])):
            new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
        for n in range(len(new_quant_df['seq3'].iloc[0])):
            new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime')
                        | (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(['genome coord', 'annotated intron size'],
                                      ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[['transcript', 'chromosome', 'position', 'strand',
                           'type']]
    return new_quant_df, lariat_df
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Return one row per annotated intron of every transcript found in ``df``.

    For each transcript in ``df['transcript']`` the annotated splice sites
    are walked in transcription order (ascending 5' coordinate on ``+``,
    descending on ``-``).  An intron whose 5' site lies in
    ``[peak - 5, peak + 5)`` of one of the transcript's ``position`` values
    is flagged ``peak=True`` and inherits scores, sequences, type and
    alt-splicing flag from that row of ``df``.  Every other intron gets
    ``peak=False``, ``type='5prime'`` and freshly extracted and scored
    splice-site sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by ``'<chromosome>:<position>'`` strings; must hold the
        columns ``transcript``, ``strand``, ``chromosome``, ``position``,
        ``3p score``, ``5p score``, ``alt splicing``, ``type``, ``seq5`` and
        ``seq3``.
    gff3, organism
        Passed unchanged to the ``SP`` annotation helpers.  ``organism``
        also selects the isoform suffix (``'.1'`` for ``'pombe'``, ``'T0'``
        otherwise) used to look transcripts up.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    PSSM
        Scoring matrix passed to ``SP.simple_score_junction``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'<chromosome>:<position>'``.  Exon and transcript
        sizes are divided by 1000; ``intron size`` is not.  Transcripts
        without annotated introns contribute no rows.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {'position': [], 'transcript': [], 'alt splicing': [],
                   'type': [], 'strand': [], 'introns in transcript': [],
                   'intron size': [], 'chromosome': [], '5p score': [],
                   '3p score': [], 'intron position': [],
                   'exon size (us)': [], 'exon size (ds)': [],
                   'transcript size': [], 'peak': [], 'seq5': [], 'seq3': []}
    inherited = ('3p score', '5p score', 'alt splicing', 'type', 'seq5',
                 'seq3')
    new_index = []

    for tx in set(df['transcript']):
        # Filter once per transcript instead of once per intron.
        tx_rows = df[df['transcript'] == tx]
        strand = tx_rows.iloc[0]['strand']
        chrom = tx_rows.iloc[0]['chromosome']
        peaks = tx_rows['position']
        iso = tx + '.1' if organism == 'pombe' else tx + 'T0'

        splice_sites = ss_dict[tx]
        if strand == '+':
            splice_sites = sorted(splice_sites, key=lambda x: x[0])
        elif strand == '-':
            splice_sites = sorted(splice_sites, key=lambda x: x[0],
                                  reverse=True)
        n_introns = len(splice_sites)

        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            df_pos = None
            for peak in peaks:
                if five_site in range(int(peak) - 5, int(peak) + 5):
                    df_pos = peak
                    break
            in_df = df_pos is not None

            tx_start, tx_end = tx_dict[iso][0], tx_dict[iso][1]
            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(n_introns)
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_end - tx_start) / 1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append(
                        (five_site - tx_start) / 1000.)
                    if n_introns > 1:
                        ds_length = (splice_sites[1][0] - three_site) / 1000.
                        if ds_length < 0:
                            # Next 5' site lies inside this intron; try the
                            # one after it, if there is one.
                            if n_introns > 2:
                                ds_length = (splice_sites[2][0] - three_site) / 1000.
                            else:
                                ds_length = np.nan
                    else:
                        ds_length = (tx_end - three_site) / 1000.
                elif strand == '-':
                    column_dict['exon size (us)'].append(
                        (tx_end - five_site) / 1000.)
                    if n_introns > 1:
                        ds_length = (three_site - splice_sites[1][0]) / 1000.
                        if ds_length < 0:
                            if n_introns > 2:
                                ds_length = (three_site - splice_sites[2][0]) / 1000.
                            else:
                                ds_length = np.nan
                    else:
                        ds_length = (three_site - tx_start) / 1000.
                column_dict['exon size (ds)'].append(ds_length)
            elif n == n_introns - 1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                if strand == '+':
                    column_dict['exon size (ds)'].append(
                        (tx_end - three_site) / 1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append(
                        (three_site - tx_start) / 1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                column_dict['exon size (ds)'].append(
                    abs(three_site - splice_sites[n + 1][0]) / 1000.)

            if in_df:
                # Intron already represented by a peak: reuse its values.
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['peak'].append(True)
                for column in inherited:
                    column_dict[column].append(df.loc[peak_index, column])
            else:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site + 1)
                    new_index.append(chrom + ':' + str(five_site + 1))
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):(three_site + 3)]
                elif strand == '-':
                    column_dict['position'].append(five_site - 1)
                    new_index.append(chrom + ':' + str(five_site - 1))
                    sequence1 = SP.reverse_complement(
                        fa_dict[chrom][(five_site - 6):(five_site + 2)])
                    sequence2 = SP.reverse_complement(
                        fa_dict[chrom][(three_site - 2):(three_site + 6)])
                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1,
                                                            sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(column_dict, index=new_index,
                          columns=list(column_dict))
    return new_df
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None,
                       branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Pair each 5' peak with a 3' splice site, score it, and backfill introns.

    Rows of ``peak_df`` whose ``type`` is ``'3prime'`` or whose
    ``looks like`` is ``'AG'`` are discarded.  The remaining peaks are keyed
    by ``'<chromosome>:<position>'`` and, for every key, the row with the
    largest ``height`` is used:

    * if its ``type`` contains ``'prime'``, the first annotated intron of the
      peak's transcript whose 5' coordinate lies in
      ``[position - 5, position + 5)`` supplies the 5'/3' sites;
    * otherwise the peak position is taken as the 5' site and the tallest
      ``type == 'AG'`` peak of the same transcript as the 3' site.

    Peaks for which no 3' site is found are dropped.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Must provide the columns read below: ``type``, ``looks like``,
        ``chromosome``, ``position``, ``index``, ``transcript``, ``height``
        and ``strand``.
    gff3
        Passed unchanged to the ``SP`` annotation helpers.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    organism : optional
        Passed unchanged to the ``SP`` helpers.
    branch_file : str, optional
        Path handed to ``SP.find_score_branches_ppy``.  Defaults to the path
        that used to be hard-coded in this function.

    Returns
    -------
    The object returned by ``SP.find_score_branches_ppy``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Work on an explicit copy: a column is added below, and assigning into
    # a filtered slice triggers pandas' chained-assignment warning.
    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size': [], 'alt splicing': [], '5p score': [],
                   '3p score': [], 'seq5': [], 'seq3': []}
    new_index = []

    # The index may hold the same coordinate more than once; every
    # occurrence is processed so that the collected columns line up with the
    # rows selected by ``isin(new_index)`` further down.
    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        if len(coord_df) == 0:
            continue

        # Tallest peak at this coordinate (positional access; the index
        # holds coordinate strings, so .ix[0] was positional as well).
        peak = coord_df.sort_values('height', ascending=False).iloc[0]
        introns = ss_dict[peak['transcript']]
        ag_peaks = quant_df[(quant_df['transcript'] == peak['transcript'])
                            & (quant_df['type'] == 'AG')]

        three_site = None
        alt3 = False
        if 'prime' in peak['type']:
            peak_range = range(peak['position'] - 5, peak['position'] + 5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            alt3 = len(ag_peaks) > 0
        elif len(ag_peaks) > 0:
            # The original test was ``'AG' in <Series>``, which looks at the
            # index labels instead of the values and was therefore never
            # true; test the values explicitly.
            five_site = peak['position']
            three_site = ag_peaks.sort_values(
                'height', ascending=False).iloc[0]['position']

        if three_site is None:
            continue

        new_index.append(coord)
        column_dict['intron size'].append(abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)

        chrom_seq = fa_dict[peak['chromosome']]
        if peak['strand'] == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        elif peak['strand'] == '-':
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(chrom_seq[three_site - 2:three_site + 6])
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)

        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][
        ['genome coord', 'chromosome', 'strand', 'transcript', 'position',
         'type']].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)
    # seq5/seq3 are kept as whole strings here; they are not expanded into
    # per-base columns before branch scoring.
    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file,
                                              fa_dict)
    return new_quant_df
def make_quant_df(junc_df, branch_df, gff3, fa_dict, organism=None):
    """Combine junction and branch evidence into one splice-site table.

    Steps, in order:

    1. Junction rows whose ``type`` is ``'3prime'`` or whose ``looks like``
       is ``'AG'`` are set aside; the rest are grouped by index label.  For
       each label the intron size (in units of 1000), the two junction
       sequences and the 5'/3' scores are taken from the row with the
       largest ``annotated intron size`` when that maximum is positive,
       otherwise from the row with the largest ``junction size`` (scored
       with ``SP.simple_score_junction``).
    2. Coordinates present only in ``branch_df`` are added, taking the first
       row after an ascending sort on ``depth``.  When the 3' splice site is
       missing it is placed three bases past the first ``TAG``/``CAG``/
       ``GAG``/``AAG`` hit (tried in that order) within 100 bases after the
       branch site.
    3. The table is passed through ``backfill_splice_sites`` and the
       ``seq5``/``seq3`` strings are expanded into one column per base.

    Parameters
    ----------
    junc_df : pandas.DataFrame
        Junction table; must carry a ``genome coord`` column in addition to
        the columns read below.
    branch_df : pandas.DataFrame
        Branch table with ``genome coord``, ``depth``, ``transcript``,
        ``chromosome``, ``5p splice site``, ``3p splice site``,
        ``branch site``, ``strand``, ``type``, ``intron size``, ``5p seq``.
    gff3, organism
        Passed unchanged to the ``SP`` helpers / ``backfill_splice_sites``.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.

    Returns
    -------
    (new_quant_df, lariat_df)
        ``lariat_df`` holds the set-aside junction rows, one per
        ``genome coord``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    quant_df = junc_df[(junc_df['type'] != '3prime')
                       & (junc_df['looks like'] != 'AG')]

    # Unique labels in order of first appearance (a set has no defined
    # order and is not a valid index for newer pandas).
    coords = list(quant_df.index.unique())
    new_intron_size, alt_splice = [], []
    score_5, score_3 = [], []
    seq5, seq3 = [], []
    for coord in coords:
        coord_df = quant_df[quant_df.index == coord]
        # More than one junction from the same peak => alternative splicing
        alt_splice.append(len(coord_df) > 1)

        if coord_df['annotated intron size'].max() > 0:
            top = coord_df.sort_values('annotated intron size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['annotated intron size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            score_3.append(top['annotated 3p score'])
            score_5.append(top['annotated 5p score'])
        else:
            top = coord_df.sort_values('junction size',
                                       ascending=False).iloc[0]
            new_intron_size.append(top['junction size'] / 1000.)
            seq5.append(top['junction sequence1'])
            seq3.append(top['junction sequence2'])
            scores = SP.simple_score_junction(top['junction sequence1'],
                                              top['junction sequence2'], pssm)
            score_3.append(scores[1])
            score_5.append(scores[0])

    new_quant_df = pd.DataFrame(
        {'intron size': new_intron_size, 'alt splicing': alt_splice,
         '5p score': score_5, '3p score': score_3,
         'seq5': seq5, 'seq3': seq3},
        index=coords,
        columns=['intron size', 'alt splicing', '5p score', '3p score',
                 'seq5', 'seq3'])

    quant_df = quant_df.sort_values('annotated intron size')
    quant_df = quant_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    new_quant_df = new_quant_df.merge(
        quant_df[['transcript', 'chromosome', 'position', 'strand', 'type']],
        right_index=True, left_index=True)

    # Rows for coordinates seen only in the branch table are collected and
    # concatenated once instead of appending to the frame row by row.
    extra_rows = []
    extra_index = []
    for coord in set(branch_df['genome coord']):
        if coord in new_quant_df.index:
            continue
        coord_df = branch_df[branch_df['genome coord'] == coord]
        # NOTE(review): ascending sort, so the *lowest* depth row is used;
        # confirm that this is intended.
        coord_df = coord_df.sort_values('depth')
        best = coord_df.iloc[0]
        chrom_seq = fa_dict[best['chromosome']]

        coord_dict = {'transcript': best['transcript'][:-2],
                      'chromosome': best['chromosome'],
                      'position': best['5p splice site'],
                      'strand': best['strand'],
                      'type': best['type'],
                      'intron size': best['intron size'],
                      'alt splicing': len(coord_df) > 1,
                      '5p score': np.nan,
                      '3p score': np.nan,
                      'seq5': '',
                      'seq3': ''}

        if len(best['5p seq']) > 0:
            coord_dict['seq5'] = best['5p seq']
        else:
            five = int(best['5p splice site'])
            if best['strand'] == '+':
                coord_dict['seq5'] = chrom_seq[(five - 1):(five + 7)]
            elif best['strand'] == '-':
                coord_dict['seq5'] = SP.reverse_complement(
                    chrom_seq[(five - 6):(five + 2)])

        if str(best['3p splice site']) != 'nan':
            three_site = best['3p splice site']
        else:
            branch = best['branch site']
            if best['strand'] == '+':
                after_branch = chrom_seq[branch:branch + 100]
            elif best['strand'] == '-':
                after_branch = SP.reverse_complement(
                    chrom_seq[branch - 100:branch])

            offset = None
            for subs in ['TAG', 'CAG', 'GAG', 'AAG']:
                if subs in after_branch:
                    offset = after_branch.find(subs) + 3
                    break
            if offset is None:
                # No candidate 3' site in the window.  Previously this
                # raised NameError or silently reused the offset of an
                # earlier branch; skip the coordinate instead.
                continue

            three_site = branch + offset
            if best['strand'] == '-':
                three_site = branch - offset
            coord_dict['intron size'] = abs(coord_dict['position'] - three_site)

        if best['strand'] == '+':
            coord_dict['seq3'] = chrom_seq[int(three_site - 5):int(three_site) + 3]
        elif best['strand'] == '-':
            coord_dict['seq3'] = SP.reverse_complement(
                chrom_seq[int(three_site) - 2:int(three_site) + 6])

        coord_dict['5p score'], coord_dict['3p score'] = \
            SP.simple_score_junction(coord_dict['seq5'], coord_dict['seq3'],
                                     pssm)
        extra_rows.append(coord_dict)
        extra_index.append(coord)

    if extra_rows:
        extra_df = pd.DataFrame(extra_rows, index=extra_index)
        extra_df = extra_df.reindex(columns=new_quant_df.columns)
        new_quant_df = pd.concat([new_quant_df, extra_df])

    new_quant_df = backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                         organism=organism)

    # One column per base; the width is taken from the first row, so all
    # sequences are assumed to be at least that long.
    if len(new_quant_df) > 0:
        for n in range(len(new_quant_df['seq5'].iloc[0])):
            new_quant_df['Base 5-' + str(n)] = [x[n] for x in new_quant_df['seq5']]
        for n in range(len(new_quant_df['seq3'].iloc[0])):
            new_quant_df['Base 3-' + str(n)] = [x[n] for x in new_quant_df['seq3']]
    new_quant_df = new_quant_df.drop(['seq5', 'seq3'], axis=1)

    lariat_df = junc_df[(junc_df['type'] == '3prime')
                        | (junc_df['looks like'] == 'AG')]
    lariat_df = lariat_df.sort_values(['genome coord', 'annotated intron size'],
                                      ascending=False)
    lariat_df = lariat_df.reset_index(drop=True).drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')
    lariat_df = lariat_df[['transcript', 'chromosome', 'position', 'strand',
                           'type']]
    return new_quant_df, lariat_df
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Return one row per annotated intron of every transcript found in ``df``.

    For each transcript in ``df['transcript']`` the annotated splice sites
    are walked in transcription order (ascending 5' coordinate on ``+``,
    descending on ``-``).  An intron whose 5' site lies in
    ``[peak - 5, peak + 5)`` of one of the transcript's ``position`` values
    is flagged ``peak=True`` and inherits scores, sequences, type and
    alt-splicing flag from that row of ``df``.  Every other intron gets
    ``peak=False``, ``type='5prime'`` and freshly extracted and scored
    splice-site sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by ``'<chromosome>:<position>'`` strings; must hold the
        columns ``transcript``, ``strand``, ``chromosome``, ``position``,
        ``3p score``, ``5p score``, ``alt splicing``, ``type``, ``seq5`` and
        ``seq3``.
    gff3, organism
        Passed unchanged to the ``SP`` annotation helpers.  ``organism``
        also selects the isoform suffix (``'.1'`` for ``'pombe'``, ``'T0'``
        otherwise) used to look transcripts up.
    fa_dict : mapping
        Chromosome name -> sliceable sequence.
    PSSM
        Scoring matrix passed to ``SP.simple_score_junction``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'<chromosome>:<position>'``.  Exon and transcript
        sizes are divided by 1000; ``intron size`` is not.  Transcripts
        without annotated introns contribute no rows.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {'position': [], 'transcript': [], 'alt splicing': [],
                   'type': [], 'strand': [], 'introns in transcript': [],
                   'intron size': [], 'chromosome': [], '5p score': [],
                   '3p score': [], 'intron position': [],
                   'exon size (us)': [], 'exon size (ds)': [],
                   'transcript size': [], 'peak': [], 'seq5': [], 'seq3': []}
    inherited = ('3p score', '5p score', 'alt splicing', 'type', 'seq5',
                 'seq3')
    new_index = []

    for tx in set(df['transcript']):
        # Filter once per transcript instead of once per intron.
        tx_rows = df[df['transcript'] == tx]
        strand = tx_rows.iloc[0]['strand']
        chrom = tx_rows.iloc[0]['chromosome']
        peaks = tx_rows['position']
        iso = tx + '.1' if organism == 'pombe' else tx + 'T0'

        splice_sites = ss_dict[tx]
        if strand == '+':
            splice_sites = sorted(splice_sites, key=lambda x: x[0])
        elif strand == '-':
            splice_sites = sorted(splice_sites, key=lambda x: x[0],
                                  reverse=True)
        n_introns = len(splice_sites)

        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            df_pos = None
            for peak in peaks:
                if five_site in range(int(peak) - 5, int(peak) + 5):
                    df_pos = peak
                    break
            in_df = df_pos is not None

            tx_start, tx_end = tx_dict[iso][0], tx_dict[iso][1]
            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(n_introns)
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_end - tx_start) / 1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append(
                        (five_site - tx_start) / 1000.)
                    if n_introns > 1:
                        ds_length = (splice_sites[1][0] - three_site) / 1000.
                        if ds_length < 0:
                            # Next 5' site lies inside this intron; try the
                            # one after it, if there is one.
                            if n_introns > 2:
                                ds_length = (splice_sites[2][0] - three_site) / 1000.
                            else:
                                ds_length = np.nan
                    else:
                        ds_length = (tx_end - three_site) / 1000.
                elif strand == '-':
                    column_dict['exon size (us)'].append(
                        (tx_end - five_site) / 1000.)
                    if n_introns > 1:
                        ds_length = (three_site - splice_sites[1][0]) / 1000.
                        if ds_length < 0:
                            if n_introns > 2:
                                ds_length = (three_site - splice_sites[2][0]) / 1000.
                            else:
                                ds_length = np.nan
                    else:
                        ds_length = (three_site - tx_start) / 1000.
                column_dict['exon size (ds)'].append(ds_length)
            elif n == n_introns - 1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                if strand == '+':
                    column_dict['exon size (ds)'].append(
                        (tx_end - three_site) / 1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append(
                        (three_site - tx_start) / 1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                column_dict['exon size (ds)'].append(
                    abs(three_site - splice_sites[n + 1][0]) / 1000.)

            if in_df:
                # Intron already represented by a peak: reuse its values.
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['peak'].append(True)
                for column in inherited:
                    column_dict[column].append(df.loc[peak_index, column])
            else:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site + 1)
                    new_index.append(chrom + ':' + str(five_site + 1))
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):(three_site + 3)]
                elif strand == '-':
                    column_dict['position'].append(five_site - 1)
                    new_index.append(chrom + ':' + str(five_site - 1))
                    sequence1 = SP.reverse_complement(
                        fa_dict[chrom][(five_site - 6):(five_site + 2)])
                    sequence2 = SP.reverse_complement(
                        fa_dict[chrom][(three_site - 2):(three_site + 6)])
                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1,
                                                            sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(column_dict, index=new_index,
                          columns=list(column_dict))
    return new_df