def add_intron_size(peaks_df, gff3, organism=None):
    """Annotate intronic peaks with the size of the intron containing them.

    Parameters
    ----------
    peaks_df : pandas.DataFrame
        Must provide the columns 'type', 'transcript' and 'position'.
        An 'intron size' column is added to this frame in place.
    gff3 : passed straight to ``SP.list_splice_sites``.
    organism : optional, passed straight to ``SP.list_splice_sites``.

    Returns
    -------
    (peaks_df, no_peaks)
        ``peaks_df`` with the new 'intron size' column (NaN for rows that are
        not 'intronic' or that fall in no annotated intron), and ``no_peaks``,
        a per-transcript collection of splice-site pairs from which every
        intron that received at least one peak has been removed.

    Raises
    ------
    KeyError
        If an intronic row names a transcript absent from the splice-site
        dictionary.
    """
    import copy

    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Keep the bookkeeping of peak-free introns separate from the lookup
    # table.  Previously both names referred to the same containers, so
    # removing a matched intron also hid it from every later peak, and a
    # second peak inside the same intron was reported as NaN.
    no_peaks = {tx: copy.copy(pairs) for tx, pairs in ss_dict.items()}

    intron_sizes = []
    for index, row in peaks_df.iterrows():
        size = np.nan  # np.NaN alias no longer exists in NumPy >= 2.0
        if row['type'] == 'intronic':
            position = row['position']
            for pair in ss_dict[row['transcript']]:
                # Pairs may be stored in either orientation; normalise so
                # one bounds check handles both.
                low = min(pair[0], pair[1])
                high = max(pair[0], pair[1])
                if low <= position <= high:
                    size = high - low
                    remaining = no_peaks[row['transcript']]
                    # The intron may already have been claimed by an
                    # earlier peak.
                    if pair in remaining:
                        remaining.remove(pair)
                    break
        intron_sizes.append(size)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
def add_intron_size(peaks_df, gff3, organism=None):
    """Annotate intronic peaks with the size of the intron containing them.

    Parameters
    ----------
    peaks_df : pandas.DataFrame
        Must provide the columns 'type', 'transcript' and 'position'.
        An 'intron size' column is added to this frame in place.
    gff3 : passed straight to ``SP.list_splice_sites``.
    organism : optional, passed straight to ``SP.list_splice_sites``.

    Returns
    -------
    (peaks_df, no_peaks)
        ``peaks_df`` with the new 'intron size' column (NaN for rows that are
        not 'intronic' or that fall in no annotated intron), and ``no_peaks``,
        a per-transcript collection of splice-site pairs from which every
        intron that received at least one peak has been removed.

    Raises
    ------
    KeyError
        If an intronic row names a transcript absent from the splice-site
        dictionary.
    """
    import copy

    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Keep the bookkeeping of peak-free introns separate from the lookup
    # table.  Previously both names referred to the same containers, so
    # removing a matched intron also hid it from every later peak, and a
    # second peak inside the same intron was reported as NaN.
    no_peaks = {tx: copy.copy(pairs) for tx, pairs in ss_dict.items()}

    intron_sizes = []
    for index, row in peaks_df.iterrows():
        size = np.nan  # np.NaN alias no longer exists in NumPy >= 2.0
        if row['type'] == 'intronic':
            position = row['position']
            for pair in ss_dict[row['transcript']]:
                # Pairs may be stored in either orientation; normalise so
                # one bounds check handles both.
                low = min(pair[0], pair[1])
                high = max(pair[0], pair[1])
                if low <= position <= high:
                    size = high - low
                    remaining = no_peaks[row['transcript']]
                    # The intron may already have been claimed by an
                    # earlier peak.
                    if pair in remaining:
                        remaining.remove(pair)
                    break
        intron_sizes.append(size)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
def find_3p_site(branch_df, gff3, organism=None):
    """Attach the annotated 3' splice site and derived distances to branches.

    Parameters
    ----------
    branch_df : pandas.DataFrame
        Must provide the columns 'transcript', '5p splice site' and
        'branch site'.  The frame is modified in place.
    gff3 : passed straight to ``SP.list_splice_sites``.
    organism : optional, passed straight to ``SP.list_splice_sites``.

    Returns
    -------
    pandas.DataFrame
        ``branch_df`` with three added columns: '3p splice site' (NaN when
        no annotated intron starts within one base of the row's 5' site),
        'intron size' (|5' site - 3' site|) and 'Branch to 3p distance'
        (|branch site - 3' site|).

    Raises
    ------
    KeyError
        If a row's truncated transcript name is not in the splice-site
        dictionary.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    three_coord = []
    for ix, r in branch_df.iterrows():
        # The last two characters of the transcript name are dropped before
        # the lookup (presumably an isoform suffix -- confirm with callers).
        introns = ss_dict[r['transcript'][:-2]]
        for intron in introns:
            # Accept an annotated 5' site within +/-1 of the observed one.
            if r['5p splice site'] in range(intron[0] - 1, intron[0] + 2):
                three_coord.append(intron[1])
                break
        else:
            # No intron matched.  np.nan is used because the np.NaN alias
            # was removed in NumPy 2.0.
            three_coord.append(np.nan)

    branch_df['3p splice site'] = three_coord

    intron_span = branch_df['5p splice site'] - branch_df['3p splice site']
    branch_df['intron size'] = intron_span.apply(abs)

    branch_gap = branch_df['branch site'] - branch_df['3p splice site']
    branch_df['Branch to 3p distance'] = branch_gap.apply(abs)
    return branch_df
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect the sequence window around every annotated splice site.

    For each intron returned by ``SP.list_splice_sites`` /
    ``SP.collapse_ss_dict`` an 8-position slice is taken around the 5' site
    (``intron[0]``) and around the 3' site (``intron[1]``) from
    ``fasta_dict[chrom]``.  Minus-strand slices are passed through
    ``SP.reverse_complement``.

    Returns a pair of lists ``(all_seq5, all_seq3)`` in matching order.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    raw_sites, flag = SP.list_splice_sites(gff3, organism=organism)
    sites_by_gene = SP.collapse_ss_dict(raw_sites)

    # Gene names are turned into isoform keys by appending a fixed suffix.
    suffix = '.1' if organism == 'pombe' else 'T0'

    all_seq5, all_seq3 = [], []
    for gene, intron_list in sites_by_gene.iteritems():
        isoform = gene + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]
        for intron in intron_list:
            five = intron[0]
            three = intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][five - 1:five + 7]
                seq3 = fasta_dict[chrom][three - 5:three + 3]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][five - 6:five + 2])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][three - 2:three + 6])
            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect the sequence window around every annotated splice site.

    For each intron returned by ``SP.list_splice_sites`` /
    ``SP.collapse_ss_dict`` an 8-position slice is taken around the 5' site
    (``intron[0]``) and around the 3' site (``intron[1]``) from
    ``fasta_dict[chrom]``.  Minus-strand slices are passed through
    ``SP.reverse_complement``.

    Returns a pair of lists ``(all_seq5, all_seq3)`` in matching order.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    raw_sites, flag = SP.list_splice_sites(gff3, organism=organism)
    sites_by_gene = SP.collapse_ss_dict(raw_sites)

    # Gene names are turned into isoform keys by appending a fixed suffix.
    suffix = '.1' if organism == 'pombe' else 'T0'

    all_seq5, all_seq3 = [], []
    for gene, intron_list in sites_by_gene.iteritems():
        isoform = gene + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]
        for intron in intron_list:
            five = intron[0]
            three = intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][five - 1:five + 7]
                seq3 = fasta_dict[chrom][three - 5:three + 3]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][five - 6:five + 2])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][three - 2:three + 6])
            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None,
                       gene_list=None, peak_df=None, organism=None):
    """Collect a 15-position sequence just inside each intron's 5' end.

    The introns come from the first available source, in this order:
    an explicit ``ss_dict``; junctions built from ``junction_bed``;
    non-'prime' rows of ``peak_df`` (each peak is paired with a coordinate
    50 positions downstream on its strand); otherwise the annotation in
    ``gff3_file`` (optionally restricted by ``gene_list``).

    Parameters
    ----------
    gff3_file : annotation passed to the ``SP`` helpers.
    fasta_file : dict, or path to a JSON file, or path handed to
        ``make_fasta_dict``.  Any ``dict`` subclass is used as-is.
    ss_dict, junction_bed, gene_list, peak_df, organism : see above.

    Returns
    -------
    dict
        Maps '<transcript>-<chrom>:<coordinate>' to the extracted sequence
        (reverse-complemented for '-' strand transcripts).

    Raises
    ------
    KeyError
        If a transcript key is missing from the transcript dictionary.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    # isinstance() also accepts dict subclasses (e.g. OrderedDict), which the
    # former exact type comparison sent down the file-path branch.
    if isinstance(fasta_file, dict):
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as f:
            fasta_dict = json.load(f)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            # NOTE(review): the iteration below expects values that are
            # sequences of (5' site, ...) tuples keyed by transcript name --
            # confirm that SP.build_junction_dict returns that shape.
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for ix, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(
                gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, introns in ss_dict.items():
        if junction_bed is None:
            # Keys are gene-level names; append the isoform suffix used by
            # the transcript dictionary.
            if organism == 'pombe':
                transcript = transcript + '.1'
            else:
                transcript = transcript + 'T0'
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        for intron in introns:
            site = intron[0]
            if strand == '+':
                key = transcript + '-' + chrom + ':' + str(site + 1)
                seq_dict[key] = fasta_dict[chrom][site + 2:site + 17]
            elif strand == '-':
                seq = fasta_dict[chrom][site - 16:site - 1]
                key = transcript + '-' + chrom + ':' + str(site)
                seq_dict[key] = SP.reverse_complement(seq)
    return seq_dict
def check_intron_position(transcript, position, gff3, organism):
    """Tell whether the intron near ``position`` is first or last.

    The introns of ``transcript`` are scanned in the order in which the
    collapsed splice-site dictionary stores them; the first one whose 5'
    coordinate lies in ``[position - 3, position + 2]`` decides the result.

    Parameters
    ----------
    transcript : key into the collapsed splice-site dictionary.
    position : int coordinate to look up.
    gff3, organism : passed straight to ``SP.list_splice_sites``.

    Returns
    -------
    (first, last) : tuple of bool
        ``first`` is True when the matching intron is at index 0, ``last``
        when it is at the final index.  For a transcript with a single
        intron only ``first`` is set, as before.  Both are False when no
        intron matches.

    Raises
    ------
    KeyError
        If ``transcript`` is not in the splice-site dictionary.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    introns = ss_dict[transcript]

    first = False
    last = False
    # Index of the final intron.  The previous code compared against the
    # length of a single intron tuple, which is unrelated to the number of
    # introns in the transcript.
    # NOTE(review): "first"/"last" follow the container's iteration order;
    # confirm collapse_ss_dict yields introns in transcript order.
    last_index = len(introns) - 1
    for n, intron in enumerate(introns):
        if intron[0] in range(position - 3, position + 3):
            if n == 0:
                first = True
            elif n == last_index:
                last = True
            break
    return first, last
def check_intron_position(transcript, position, gff3, organism):
    """Tell whether the intron near ``position`` is first or last.

    The introns of ``transcript`` are scanned in the order in which the
    collapsed splice-site dictionary stores them; the first one whose 5'
    coordinate lies in ``[position - 3, position + 2]`` decides the result.

    Parameters
    ----------
    transcript : key into the collapsed splice-site dictionary.
    position : int coordinate to look up.
    gff3, organism : passed straight to ``SP.list_splice_sites``.

    Returns
    -------
    (first, last) : tuple of bool
        ``first`` is True when the matching intron is at index 0, ``last``
        when it is at the final index.  For a transcript with a single
        intron only ``first`` is set, as before.  Both are False when no
        intron matches.

    Raises
    ------
    KeyError
        If ``transcript`` is not in the splice-site dictionary.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)
    introns = ss_dict[transcript]

    first = False
    last = False
    # Index of the final intron.  The previous code compared against the
    # length of a single intron tuple, which is unrelated to the number of
    # introns in the transcript.
    # NOTE(review): "first"/"last" follow the container's iteration order;
    # confirm collapse_ss_dict yields introns in transcript order.
    last_index = len(introns) - 1
    for n, intron in enumerate(introns):
        if intron[0] in range(position - 3, position + 3):
            if n == 0:
                first = True
            elif n == last_index:
                last = True
            break
    return first, last
def find_3p_site(branch_df, gff3, organism=None):
    """Attach the annotated 3' splice site and derived distances to branches.

    Parameters
    ----------
    branch_df : pandas.DataFrame
        Must provide the columns 'transcript', '5p splice site' and
        'branch site'.  The frame is modified in place.
    gff3 : passed straight to ``SP.list_splice_sites``.
    organism : optional, passed straight to ``SP.list_splice_sites``.

    Returns
    -------
    pandas.DataFrame
        ``branch_df`` with three added columns: '3p splice site' (NaN when
        no annotated intron starts within one base of the row's 5' site),
        'intron size' (|5' site - 3' site|) and 'Branch to 3p distance'
        (|branch site - 3' site|).

    Raises
    ------
    KeyError
        If a row's truncated transcript name is not in the splice-site
        dictionary.
    """
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    three_coord = []
    for ix, r in branch_df.iterrows():
        # The last two characters of the transcript name are dropped before
        # the lookup (presumably an isoform suffix -- confirm with callers).
        introns = ss_dict[r['transcript'][:-2]]
        for intron in introns:
            # Accept an annotated 5' site within +/-1 of the observed one.
            if r['5p splice site'] in range(intron[0] - 1, intron[0] + 2):
                three_coord.append(intron[1])
                break
        else:
            # No intron matched.  np.nan is used because the np.NaN alias
            # was removed in NumPy 2.0.
            three_coord.append(np.nan)

    branch_df['3p splice site'] = three_coord

    intron_span = branch_df['5p splice site'] - branch_df['3p splice site']
    branch_df['intron size'] = intron_span.apply(abs)

    branch_gap = branch_df['branch site'] - branch_df['3p splice site']
    branch_df['Branch to 3p distance'] = branch_gap.apply(abs)
    return branch_df
def build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=None):
    """Classify junctions from a BED file against annotated introns.

    Each qualifying line of ``junction_bed`` (read depth of at least 5) is
    assigned to the first same-strand transcript on its chromosome that
    strictly contains it, then labelled 'Annotated', 'Nested',
    '5p tethered', '3p tethered' or 'Other' by comparing its coordinates
    with that transcript's annotated introns.

    Parameters
    ----------
    junction_bed : path of a tab-separated junction file; columns 0, 1, 2,
        4, 5 and 10 are read (presumably BED12 as written by a splice-aware
        aligner -- TODO confirm).
    gff3_file : annotation passed to ``SP.list_splice_sites``.
    transcript_dict : maps transcript name to a sequence whose items 0-3
        are used here as start, end, strand and chromosome.
    organism : optional, passed to ``SP.list_splice_sites``.

    Returns
    -------
    dict
        Keyed by ``(transcript, annotated intron size)``; each value is a
        list of ``[chromosome, jct_start, jct_end, strand, depth, jct_type,
        size, ann_size, ann_start, ann_stop]`` records.

    A summary line with the number of unassigned junctions is printed.
    """
    junction_dict = {}
    transcript_by_chr = {}
    unassigned_count = 0
    ss_dict, flag = SP.list_splice_sites(gff3_file, organism=organism)
    ss_by_gene = SP.collapse_ss_dict(ss_dict)
    # Only genes that actually have introns are of interest.
    ss_by_gene = {k:v for k, v in ss_by_gene.items() if len(v) > 0}

    # Index intron-containing transcripts by chromosome.  The gene key is the
    # transcript name minus its last two characters.
    for transcript, coords in transcript_dict.iteritems():
        if transcript[:-2] in ss_by_gene.keys():
            chromosome = coords[3]
            if chromosome in transcript_by_chr:
                transcript_by_chr[chromosome].append(transcript)
            else:
                transcript_by_chr[chromosome] = []
                transcript_by_chr[chromosome].append(transcript)

    # Line counter; not used after being incremented.
    a = 0
    with open(junction_bed, 'r') as fin:
        for line in fin:
            a += 1
            jct_transcript = None
            jct_type = 'Other'
            intron_num = None
            # Only lines starting with 'c' or 'I' are parsed (presumably
            # 'chr...' and Roman-numeral chromosome names -- confirm).
            if line.startswith('c') or line.startswith('I'):
                columns = line.split('\t')
                # Translate Roman-numeral chromosome names.
                lat_rom = {'I':'chr1', 'II':'chr2', 'III':'chr3'}
                chromosome = columns[0]
                if chromosome in lat_rom:
                    chromosome = lat_rom[chromosome]

                # NOTE(review): when the chromosome has no indexed
                # transcripts, transcript_list keeps its value from an
                # earlier line (or is unbound on the first) -- confirm.
                if chromosome in transcript_by_chr:
                    transcript_list = transcript_by_chr[chromosome]

                # Junction coordinates are derived from the feature start/end
                # and the two comma-separated sizes in column 10; start and
                # end are swapped for '-' so that start is the 5' side.
                strand = columns[5]
                if strand == '+':
                    jct_start = int(columns[1])+int(columns[10].split(',')[0])-1
                    jct_end = int(columns[2])-int(columns[10].split(',')[1])-1
                elif strand == '-':
                    jct_start = int(columns[2])-int(columns[10].split(',')[1])
                    jct_end = int(columns[1])+int(columns[10].split(',')[0])
                depth = int(columns[4])
                size = abs(jct_end-jct_start)

                # Ignore weakly supported junctions.
                if depth >= 5:
                    assigned = False
                    for transcript in transcript_list:
                        if jct_start > transcript_dict[transcript][0] and jct_end < transcript_dict[transcript][1] and strand == transcript_dict[transcript][2]:
                            assigned = True
                            jct_transcript = transcript
                            # Transpose the intron pairs into (all 5' sites,
                            # all 3' sites).
                            # NOTE(review): indexing the zip result relies on
                            # Python 2, where zip returns a list.
                            all_sites = zip(*ss_by_gene[transcript[:-2]])
                            try:
                                if jct_start in all_sites[0] and jct_end in all_sites[1]:
                                    # Both ends are annotated splice sites.
                                    jct_type = 'Annotated'
                                    ann_size = size
                                    intron_num = all_sites[0].index(jct_start)+1
                                    ann_start = jct_start
                                    ann_stop = jct_end
                                    break
                                else:
                                    # Find the first annotated intron that
                                    # contains the junction or shares one
                                    # end with it.  ann_size is reset for
                                    # every candidate, so it stays None
                                    # when nothing matches.
                                    n=0
                                    for intron in ss_by_gene[transcript[:-2]]:
                                        n += 1
                                        ann_size = None
                                        if strand == '+':
                                            if jct_start > intron[0] and jct_end < intron[1]:
                                                ann_size = abs(intron[1]-intron[0])
                                                jct_type = 'Nested'
                                                ann_start = intron[0]
                                                ann_stop = intron[1]
                                                intron_num = n
                                                break
                                            elif jct_start >= intron[0] and jct_end == intron[1]:
                                                ann_size = abs(intron[1]-intron[0])
                                                jct_type = '3p tethered'
                                                ann_start = intron[0]
                                                ann_stop = intron[1]
                                                intron_num = n
                                                break
                                            elif jct_start == intron[0] and jct_end <= intron[1]:
                                                ann_size = abs(intron[1]-intron[0])
                                                jct_type = '5p tethered'
                                                ann_start = intron[0]
                                                ann_stop = intron[1]
                                                intron_num = n
                                                break
                                        elif strand == '-':
                                            if jct_start < intron[0] and jct_end > intron[1]:
                                                jct_type = 'Nested'
                                                ann_size = intron[0]-intron[1]
                                                ann_start = intron[0]
                                                ann_stop = intron[1]
                                                intron_num = n
                                                break
                                            elif jct_start <= intron[0] and jct_end == intron[1]:
                                                jct_type = '5p tethered'
                                                ann_size = intron[0]-intron[1]
                                                ann_start = intron[0]
                                                ann_stop = intron[1]
                                                intron_num = n
                                                break
                                            elif jct_start == intron[0] and jct_end >= intron[1]:
                                                jct_type = '3p tethered'
                                                ann_size = intron[0]-intron[1]
                                                ann_start = intron[0]
                                                ann_stop = intron[1]
                                                intron_num = n
                                                break
                                    # Only the first containing transcript is
                                    # considered.
                                    break
                            except IndexError:
                                print transcript

                    if assigned is False:
                        unassigned_count += 1

                    try:
                        if jct_transcript != None:
                            # No annotated intron matched: file the junction
                            # under size 0 as 'Other'.
                            if ann_size == None:
                                jct_type = "Other"
                                ann_size = 0
                                ann_start = None
                                ann_stop = None
                                intron_num = None
                            if (jct_transcript, ann_size) not in junction_dict:
                                junction_dict[(jct_transcript, ann_size)] = []
                            # intron_num is computed above but is not part of
                            # the stored record.
                            junction_dict[(jct_transcript, ann_size)].append([chromosome, jct_start, jct_end, strand, depth, jct_type, size, ann_size, ann_start, ann_stop])
                    except ValueError:
                        print jct_transcript
                        print jct_type

    print str(unassigned_count)+' junctions not assigned to transcripts'
    return junction_dict
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    """Build a per-intron feature table from a peak dataframe.

    Peaks that are neither of type '3prime' nor look like 'AG' are paired
    with a 3' splice site, their intron size and splice-site sequences are
    scored against a PSSM, and the result is completed by
    ``SP.backfill_splice_sites`` and ``SP.find_score_branches_ppy``.

    Parameters
    ----------
    peak_df : pandas.DataFrame with at least the columns 'type',
        'looks like', 'chromosome', 'position', 'transcript', 'strand',
        'height' and 'index' (all are read or dropped below).
    gff3 : annotation passed to the ``SP`` helpers.
    fa_dict : maps chromosome name to a sliceable sequence.
    organism : optional, passed to the ``SP`` helpers.

    Returns
    -------
    Whatever ``SP.find_score_branches_ppy`` returns for the assembled table.
    """
    # count1/count2 (and seq5/seq3 below) are initialised but never used.
    count1 = 0
    count2 = 0
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    quant_df = peak_df[(peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')]
    # Index rows by '<chromosome>:<position>'.
    # NOTE(review): this assigns into a filtered selection of peak_df;
    # pandas may emit SettingWithCopyWarning here.
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    # Parallel lists, one entry per coordinate that gets a 3' site.
    column_dict = {'intron size':[], 'alt splicing':[], '5p score':[], '3p score':[], 'seq5':[], 'seq3':[]}
    new_index = []
    seq5 = []
    seq3 = []

    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        three_site = None
        alt3 = False
        if len(coord_df) > 0:
            # Keep the tallest peak at this coordinate.
            # NOTE(review): `.ix` was removed in pandas 1.0.
            coord_df = coord_df.sort_values('height', ascending=False).ix[0]
            introns = ss_dict[coord_df['transcript']]
            if 'prime' in coord_df['type']:
                # Match the peak to an annotated 5' site within
                # [position - 5, position + 4].
                peak_range = range(coord_df['position']-5,coord_df['position']+5)
                for intron in introns:
                    if intron[0] in peak_range:
                        five_site = intron[0]
                        three_site = intron[1]
                        break
                # Any 'AG'-type peak on the same transcript marks the intron
                # as alternatively spliced.
                if len(quant_df[(quant_df['transcript'] == coord_df['transcript']) & (quant_df['type'] == 'AG')]) > 0:
                    alt3=True
            else:
                # NOTE(review): `in` on a pandas Series tests index labels,
                # not values -- confirm this is the intended check.
                if 'AG' in quant_df[quant_df['transcript'] == coord_df['transcript']]['type']:
                    five_site = coord_df['position']
                    three_df = quant_df[(quant_df['transcript'] == coord_df['transcript']) & (quant_df['type'] == 'AG')]
                    three_df = three_df.sort_values('height', ascending=False)
                    three_site = three_df.ix[0]['position']

        if three_site is not None:
            new_index.append(coord)
            # Divided by 1000 (presumably to express the size in kb).
            size = abs(three_site-five_site)/1000.
            column_dict['intron size'].append(size)
            column_dict['alt splicing'].append(alt3)
            # 8-position windows around both splice sites; '-' strand windows
            # are reverse-complemented.
            if coord_df['strand'] == '+':
                s5 = fa_dict[coord_df['chromosome']][five_site-2:five_site+6]
                s3 = fa_dict[coord_df['chromosome']][three_site-6:three_site+2]
            elif coord_df['strand'] == '-':
                s5 = fa_dict[coord_df['chromosome']][five_site-6:five_site+2]
                s5 = SP.reverse_complement(s5)
                s3 = fa_dict[coord_df['chromosome']][three_site-2:three_site+6]
                s3 = SP.reverse_complement(s3)
            column_dict['seq5'].append(s5)
            column_dict['seq3'].append(s3)
            # scores[0] is stored as the 5' score, scores[1] as the 3' score.
            scores = SP.simple_score_junction(s5, s3, pssm)
            column_dict['3p score'].append(scores[1])
            column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][['genome coord','chromosome', 'strand','transcript','position','type']]
    # NOTE(review): the lists are assigned positionally, so their length must
    # equal the number of selected rows -- confirm for duplicated coordinates.
    for column, data in column_dict.iteritems():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(subset='genome coord', keep='first').set_index('genome coord')

    # Add annotated introns that had no peak.
    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm, organism=organism)

    # Disabled: expansion of seq5/seq3 into one column per base.
    #for n in range(len(new_quant_df['seq5'].iloc[0])):
    #    new_quant_df['Base 5-'+str(n)] = [x[n] for x in new_quant_df['seq5']]
    #for n in range(len(new_quant_df['seq3'].iloc[0])):
    #    new_quant_df['Base 3-'+str(n)] = [x[n] for x in new_quant_df['seq3']]
    #new_quant_df = new_quant_df.drop(['seq5','seq3'], axis=1)

    # NOTE(review): the branch file is a hard-coded absolute path.
    new_quant_df = SP.find_score_branches_ppy(new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt', fa_dict)
    return new_quant_df
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Expand a peak table so every annotated intron of its transcripts
    has a row.

    For each transcript present in ``df``, every annotated intron is
    emitted once.  Introns whose 5' site lies within
    ``[peak - 5, peak + 4]`` of a peak position reuse that peak's scores
    and sequences from ``df``; all others are filled in from the
    annotation and scored with ``SP.simple_score_junction``.

    Parameters
    ----------
    df : pandas.DataFrame with columns 'transcript', 'strand',
        'chromosome', 'position', '3p score', '5p score', 'alt splicing',
        'type', 'seq5' and 'seq3'; rows are looked up by
        '<chromosome>:<position>' labels, so the index is presumably in
        that form -- confirm against the caller.
    gff3 : annotation passed to the ``SP`` helpers.
    fa_dict : maps chromosome name to a sliceable sequence.
    PSSM : scoring matrix handed to ``SP.simple_score_junction``.
    organism : optional; 'pombe' selects the '.1' isoform suffix,
        anything else 'T0'.

    Returns
    -------
    pandas.DataFrame
        One row per annotated intron, indexed by '<chromosome>:<position>',
        with the columns listed in ``column_dict``.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Parallel lists; every intron must append exactly once to each of them.
    column_dict = {'position':[],'transcript':[],'alt splicing':[],'type':[],'strand':[], 'introns in transcript':[], 'intron size':[],'chromosome':[], '5p score':[], '3p score':[], 'intron position':[], 'exon size (us)':[], 'exon size (ds)':[],'transcript size':[], 'peak':[], 'seq5':[],'seq3':[]}
    new_index = []

    for tx in set(df['transcript']):
        strand = df[df['transcript'] == tx].iloc[0]['strand']
        splice_sites = ss_dict[tx]
        # Order introns by 5' coordinate, ascending for '+' and descending
        # for '-', so index 0 is handled as the first intron.
        if strand == '+':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0])
        elif strand == '-':
            splice_sites = sorted(list(splice_sites), key=lambda x:x[0], reverse=True)

        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            in_df = False
            for peak in df[df['transcript'] == tx]['position']:
                if five_site in range(int(peak)-5,int(peak)+5):
                    in_df = True
                    df_pos = peak
                    break

            column_dict['transcript'].append(tx)
            if organism == 'pombe':
                iso = tx+'.1'
            else:
                iso = tx+'T0'

            # Unlike the exon and transcript sizes below, the intron size is
            # not divided by 1000.
            column_dict['intron size'].append(abs(three_site-five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)
            chrom = df[df['transcript'] == tx].iloc[0]['chromosome']
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_dict[iso][1]-tx_dict[iso][0])/1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    # Upstream exon runs from the transcript start to the
                    # 5' site.
                    column_dict['exon size (us)'].append((five_site-tx_dict[iso][0])/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n+1][0] - three_site)/1000.
                        # A negative length means the next intron starts
                        # before this one ends; fall back to the intron
                        # after it, or NaN when there is none.
                        try:
                            if ds_length < 0:
                                ds_length = (splice_sites[n+2][0] - three_site)/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        # Single intron: downstream exon ends at the
                        # transcript end.
                        ds_length = (tx_dict[iso][1] - three_site)/1000.

                elif strand == '-':
                    column_dict['exon size (us)'].append((tx_dict[iso][1]-five_site)/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site - splice_sites[n+1][0])/1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site - splice_sites[n+2][0])/1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0])/1000.
                column_dict['exon size (ds)'].append(ds_length)

            elif n == len(splice_sites)-1:
                column_dict['intron position'].append('Last')
                # Upstream exon: gap between the previous intron's 3' site
                # and this 5' site.
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                if strand == '+':
                    column_dict['exon size (ds)'].append((tx_dict[iso][1]-three_site)/1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append((three_site - tx_dict[iso][0])/1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                column_dict['exon size (ds)'].append(abs(three_site - splice_sites[n+1][0])/1000.)

            if in_df is True:
                # Reuse the values already computed for the matching peak.
                peak_index = chrom+':'+str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index,'3p score'])
                column_dict['5p score'].append(df.loc[peak_index,'5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,'alt splicing'])
                column_dict['type'].append(df.loc[peak_index,'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index,'seq5'])
                column_dict['seq3'].append(df.loc[peak_index,'seq3'])

            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site+1)
                    new_index.append(chrom+':'+str(five_site+1))
                    sequence1 = fa_dict[chrom][(five_site-1):(five_site+7)]
                    sequence2 = fa_dict[chrom][(three_site-5):(three_site+3)]
                elif strand == '-':
                    column_dict['position'].append(five_site-1)
                    new_index.append(chrom+':'+str(five_site-1))
                    sequence1 = fa_dict[chrom][(five_site-6):(five_site+2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site-2):(three_site+6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=column_dict.keys(), index=new_index)
    for column, data in column_dict.iteritems():
        new_df[column] = data
    return new_df
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None,
                       gene_list=None, peak_df=None, organism=None):
    """Collect a 15-position sequence just inside each intron's 5' end.

    The introns come from the first available source, in this order:
    an explicit ``ss_dict``; junctions built from ``junction_bed``;
    non-'prime' rows of ``peak_df`` (each peak is paired with a coordinate
    50 positions downstream on its strand); otherwise the annotation in
    ``gff3_file`` (optionally restricted by ``gene_list``).

    Parameters
    ----------
    gff3_file : annotation passed to the ``SP`` helpers.
    fasta_file : dict, or path to a JSON file, or path handed to
        ``make_fasta_dict``.  Any ``dict`` subclass is used as-is.
    ss_dict, junction_bed, gene_list, peak_df, organism : see above.

    Returns
    -------
    dict
        Maps '<transcript>-<chrom>:<coordinate>' to the extracted sequence
        (reverse-complemented for '-' strand transcripts).

    Raises
    ------
    KeyError
        If a transcript key is missing from the transcript dictionary.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    # isinstance() also accepts dict subclasses (e.g. OrderedDict), which the
    # former exact type comparison sent down the file-path branch.
    if isinstance(fasta_file, dict):
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as f:
            fasta_dict = json.load(f)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            # NOTE(review): the iteration below expects values that are
            # sequences of (5' site, ...) tuples keyed by transcript name --
            # confirm that SP.build_junction_dict returns that shape.
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for ix, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, intron_flag = SP.list_splice_sites(
                gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    seq_dict = {}
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, introns in ss_dict.items():
        if junction_bed is None:
            # Keys are gene-level names; append the isoform suffix used by
            # the transcript dictionary.
            if organism == 'pombe':
                transcript = transcript + '.1'
            else:
                transcript = transcript + 'T0'
        strand = transcript_dict[transcript][2]
        chrom = transcript_dict[transcript][3]
        for intron in introns:
            site = intron[0]
            if strand == '+':
                key = transcript + '-' + chrom + ':' + str(site + 1)
                seq_dict[key] = fasta_dict[chrom][site + 2:site + 17]
            elif strand == '-':
                seq = fasta_dict[chrom][site - 16:site - 1]
                key = transcript + '-' + chrom + ':' + str(site)
                seq_dict[key] = SP.reverse_complement(seq)
    return seq_dict
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Build 4x8 base matrices for annotated 5' and 3' splice sites.

    Rows are A, C, T, G (in that order); columns are the 8 positions of the
    window sliced around each splice site.  The organism is chosen from the
    annotation file name: 'pombe' when that word occurs in ``gff3``.

    With ``PSSM=False`` each count is divided by the number of introns.
    With ``PSSM=True`` zero counts are first replaced by 1 and every cell
    becomes ``log2((count / introns) / nuc_prob[row])``, where ``nuc_prob``
    is the result of ``gc_content(fasta_dict)`` (presumably background base
    frequencies in the same row order -- confirm).

    Side effect: NumPy's global float print format is set to one decimal.

    Returns ``(pos_matrix_5prime, pos_matrix_3prime)``.
    """
    # Load the annotation with the organism implied by the file name.
    if 'pombe' in gff3.lower():
        organism = 'pombe'
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
    else:
        organism = None
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)

    nuc_prob = gc_content(fasta_dict)

    row_of_base = {"A":0, "C":1, "T":2, "G":3}
    pos_matrix_5prime = np.zeros([4,8])
    pos_matrix_3prime = np.zeros([4,8])

    # Tally the base seen at every window position of every intron.
    intron_total = 0
    for transcript, introns in ss_dict.iteritems():
        if organism == 'pombe':
            isoform = transcript + '.1'
        else:
            isoform = transcript + 'T0'
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            intron_total += 1

            # 5' splice-site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[0]-1):(intron[0]+7)]
            elif strand == '-':
                seq = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0]-6):(intron[0]+2)])
            for col, base in enumerate(seq):
                pos_matrix_5prime[row_of_base[base], col] += 1

            # 3' splice-site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[1]-5):(intron[1]+3)]
            elif strand == '-':
                seq = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1]-2):(intron[1]+6)])
            for col, base in enumerate(seq):
                pos_matrix_3prime[row_of_base[base], col] += 1

    np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})

    # Convert counts to frequencies or log-odds, cell by cell.
    total = float(intron_total)
    for row in range(4):
        for col in range(8):
            if PSSM is False:
                pos_matrix_5prime[row, col] = pos_matrix_5prime[row, col] / total
                pos_matrix_3prime[row, col] = pos_matrix_3prime[row, col] / total
            if PSSM is True:
                # Pseudocount so that log2 never sees zero.
                if pos_matrix_5prime[row, col] == 0:
                    pos_matrix_5prime[row, col] += 1
                if pos_matrix_3prime[row, col] == 0:
                    pos_matrix_3prime[row, col] += 1
                pos_matrix_5prime[row, col] = np.log2(
                    (pos_matrix_5prime[row, col] / total) / nuc_prob[row])
                pos_matrix_3prime[row, col] = np.log2(
                    (pos_matrix_3prime[row, col] / total) / nuc_prob[row])

    return (pos_matrix_5prime, pos_matrix_3prime)
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    """Build a per-intron feature table from a peak dataframe.

    Peaks that are neither of type '3prime' nor look like 'AG' are paired
    with a 3' splice site, their intron size and splice-site sequences are
    scored against a PSSM, and the result is completed by
    ``SP.backfill_splice_sites`` and ``SP.find_score_branches_ppy``.

    Parameters
    ----------
    peak_df : pandas.DataFrame with at least the columns 'type',
        'looks like', 'chromosome', 'position', 'transcript', 'strand',
        'height' and 'index' (all are read or dropped below).
    gff3 : annotation passed to the ``SP`` helpers.
    fa_dict : maps chromosome name to a sliceable sequence.
    organism : optional, passed to the ``SP`` helpers.

    Returns
    -------
    Whatever ``SP.find_score_branches_ppy`` returns for the assembled table.
    """
    # count1/count2 (and seq5/seq3 below) are initialised but never used.
    count1 = 0
    count2 = 0
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    quant_df = peak_df[(peak_df['type'] != '3prime')
                       & (peak_df['looks like'] != 'AG')]
    # Index rows by '<chromosome>:<position>'.
    # NOTE(review): this assigns into a filtered selection of peak_df;
    # pandas may emit SettingWithCopyWarning here.
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    # Parallel lists, one entry per coordinate that gets a 3' site.
    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []
    seq5 = []
    seq3 = []

    for coord in quant_df.index:
        coord_df = quant_df[quant_df.index == coord]
        three_site = None
        alt3 = False
        if len(coord_df) > 0:
            # Keep the tallest peak at this coordinate.
            # NOTE(review): `.ix` was removed in pandas 1.0.
            coord_df = coord_df.sort_values('height', ascending=False).ix[0]
            introns = ss_dict[coord_df['transcript']]
            if 'prime' in coord_df['type']:
                # Match the peak to an annotated 5' site within
                # [position - 5, position + 4].
                peak_range = range(coord_df['position'] - 5,
                                   coord_df['position'] + 5)
                for intron in introns:
                    if intron[0] in peak_range:
                        five_site = intron[0]
                        three_site = intron[1]
                        break
                # Any 'AG'-type peak on the same transcript marks the intron
                # as alternatively spliced.
                if len(quant_df[(quant_df['transcript'] == coord_df['transcript'])
                                & (quant_df['type'] == 'AG')]) > 0:
                    alt3 = True
            else:
                # NOTE(review): `in` on a pandas Series tests index labels,
                # not values -- confirm this is the intended check.
                if 'AG' in quant_df[quant_df['transcript'] == coord_df['transcript']]['type']:
                    five_site = coord_df['position']
                    three_df = quant_df[
                        (quant_df['transcript'] == coord_df['transcript'])
                        & (quant_df['type'] == 'AG')]
                    three_df = three_df.sort_values('height', ascending=False)
                    three_site = three_df.ix[0]['position']

        if three_site is not None:
            new_index.append(coord)
            # Divided by 1000 (presumably to express the size in kb).
            size = abs(three_site - five_site) / 1000.
            column_dict['intron size'].append(size)
            column_dict['alt splicing'].append(alt3)
            # 8-position windows around both splice sites; '-' strand windows
            # are reverse-complemented.
            if coord_df['strand'] == '+':
                s5 = fa_dict[coord_df['chromosome']][five_site - 2:five_site + 6]
                s3 = fa_dict[coord_df['chromosome']][three_site - 6:three_site + 2]
            elif coord_df['strand'] == '-':
                s5 = fa_dict[coord_df['chromosome']][five_site - 6:five_site + 2]
                s5 = SP.reverse_complement(s5)
                s3 = fa_dict[coord_df['chromosome']][three_site - 2:three_site + 6]
                s3 = SP.reverse_complement(s3)
            column_dict['seq5'].append(s5)
            column_dict['seq3'].append(s3)
            # scores[0] is stored as the 5' score, scores[1] as the 3' score.
            scores = SP.simple_score_junction(s5, s3, pssm)
            column_dict['3p score'].append(scores[1])
            column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]]
    # NOTE(review): the lists are assigned positionally, so their length must
    # equal the number of selected rows -- confirm for duplicated coordinates.
    for column, data in column_dict.iteritems():
        new_quant_df[column] = data
    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    # Add annotated introns that had no peak.
    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm,
                                            organism=organism)

    # Disabled: expansion of seq5/seq3 into one column per base.
    #for n in range(len(new_quant_df['seq5'].iloc[0])):
    #    new_quant_df['Base 5-'+str(n)] = [x[n] for x in new_quant_df['seq5']]
    #for n in range(len(new_quant_df['seq3'].iloc[0])):
    #    new_quant_df['Base 3-'+str(n)] = [x[n] for x in new_quant_df['seq3']]
    #new_quant_df = new_quant_df.drop(['seq5','seq3'], axis=1)

    # NOTE(review): the branch file is a hard-coded absolute path.
    new_quant_df = SP.find_score_branches_ppy(
        new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt',
        fa_dict)
    return new_quant_df
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Expand a peak table so every annotated intron of its transcripts
    has a row.

    For each transcript present in ``df``, every annotated intron is
    emitted once.  Introns whose 5' site lies within
    ``[peak - 5, peak + 4]`` of a peak position reuse that peak's scores
    and sequences from ``df``; all others are filled in from the
    annotation and scored with ``SP.simple_score_junction``.

    Parameters
    ----------
    df : pandas.DataFrame with columns 'transcript', 'strand',
        'chromosome', 'position', '3p score', '5p score', 'alt splicing',
        'type', 'seq5' and 'seq3'; rows are looked up by
        '<chromosome>:<position>' labels, so the index is presumably in
        that form -- confirm against the caller.
    gff3 : annotation passed to the ``SP`` helpers.
    fa_dict : maps chromosome name to a sliceable sequence.
    PSSM : scoring matrix handed to ``SP.simple_score_junction``.
    organism : optional; 'pombe' selects the '.1' isoform suffix,
        anything else 'T0'.

    Returns
    -------
    pandas.DataFrame
        One row per annotated intron, indexed by '<chromosome>:<position>',
        with the columns listed in ``column_dict``.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Parallel lists; every intron must append exactly once to each of them.
    column_dict = {
        'position': [],
        'transcript': [],
        'alt splicing': [],
        'type': [],
        'strand': [],
        'introns in transcript': [],
        'intron size': [],
        'chromosome': [],
        '5p score': [],
        '3p score': [],
        'intron position': [],
        'exon size (us)': [],
        'exon size (ds)': [],
        'transcript size': [],
        'peak': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for tx in set(df['transcript']):
        strand = df[df['transcript'] == tx].iloc[0]['strand']
        splice_sites = ss_dict[tx]
        # Order introns by 5' coordinate, ascending for '+' and descending
        # for '-', so index 0 is handled as the first intron.
        if strand == '+':
            splice_sites = sorted(list(splice_sites), key=lambda x: x[0])
        elif strand == '-':
            splice_sites = sorted(list(splice_sites), key=lambda x: x[0],
                                  reverse=True)

        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            in_df = False
            for peak in df[df['transcript'] == tx]['position']:
                if five_site in range(int(peak) - 5, int(peak) + 5):
                    in_df = True
                    df_pos = peak
                    break

            column_dict['transcript'].append(tx)
            if organism == 'pombe':
                iso = tx + '.1'
            else:
                iso = tx + 'T0'

            # Unlike the exon and transcript sizes below, the intron size is
            # not divided by 1000.
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)
            chrom = df[df['transcript'] == tx].iloc[0]['chromosome']
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append(
                (tx_dict[iso][1] - tx_dict[iso][0]) / 1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    # Upstream exon runs from the transcript start to the
                    # 5' site.
                    column_dict['exon size (us)'].append(
                        (five_site - tx_dict[iso][0]) / 1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n + 1][0] - three_site) / 1000.
                        # A negative length means the next intron starts
                        # before this one ends; fall back to the intron
                        # after it, or NaN when there is none.
                        try:
                            if ds_length < 0:
                                ds_length = (splice_sites[n + 2][0] - three_site) / 1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        # Single intron: downstream exon ends at the
                        # transcript end.
                        ds_length = (tx_dict[iso][1] - three_site) / 1000.

                elif strand == '-':
                    column_dict['exon size (us)'].append(
                        (tx_dict[iso][1] - five_site) / 1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site - splice_sites[n + 1][0]) / 1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site - splice_sites[n + 2][0]) / 1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0]) / 1000.
                column_dict['exon size (ds)'].append(ds_length)

            elif n == len(splice_sites) - 1:
                column_dict['intron position'].append('Last')
                # Upstream exon: gap between the previous intron's 3' site
                # and this 5' site.
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                if strand == '+':
                    column_dict['exon size (ds)'].append(
                        (tx_dict[iso][1] - three_site) / 1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append(
                        (three_site - tx_dict[iso][0]) / 1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                column_dict['exon size (ds)'].append(
                    abs(three_site - splice_sites[n + 1][0]) / 1000.)

            if in_df is True:
                # Reuse the values already computed for the matching peak.
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index, '3p score'])
                column_dict['5p score'].append(df.loc[peak_index, '5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,
                                                          'alt splicing'])
                column_dict['type'].append(df.loc[peak_index, 'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index, 'seq5'])
                column_dict['seq3'].append(df.loc[peak_index, 'seq3'])

            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site + 1)
                    new_index.append(chrom + ':' + str(five_site + 1))
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):(three_site + 3)]
                elif strand == '-':
                    column_dict['position'].append(five_site - 1)
                    new_index.append(chrom + ':' + str(five_site - 1))
                    sequence1 = fa_dict[chrom][(five_site - 6):(five_site + 2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site - 2):(three_site + 6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(
                    sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=column_dict.keys(), index=new_index)
    for column, data in column_dict.iteritems():
        new_df[column] = data
    return new_df
def _classify_junction(jct_start, jct_end, strand, size, introns):
    """Classify one junction against the annotated introns of a gene.

    Parameters
    ----------
    jct_start, jct_end : int
        Junction coordinates as computed by ``build_junction_dict``.
    strand : str
        '+' or '-'.
    size : int
        ``abs(jct_end - jct_start)``; reported as the annotated size when the
        junction is 'Annotated'.
    introns : sequence
        Annotated introns of the gene; each item is indexed as
        ``intron[0]`` / ``intron[1]``.

    Returns
    -------
    tuple
        ``(jct_type, ann_size, ann_start, ann_stop)``. When no intron matches,
        ``('Other', 0, None, None)`` is returned.

    Raises
    ------
    IndexError
        If ``introns`` is empty or an intron has fewer than two coordinates.
    """
    # list() keeps the result subscriptable on Python 3 as well as Python 2.
    all_sites = list(zip(*introns))

    # The two ends are tested independently, so they are not required to
    # belong to the same annotated intron.
    if jct_start in all_sites[0] and jct_end in all_sites[1]:
        return 'Annotated', size, jct_start, jct_end

    for intron in introns:
        ann_start, ann_stop = intron[0], intron[1]
        if strand == '+':
            ann_size = abs(ann_stop - ann_start)
            if jct_start > ann_start and jct_end < ann_stop:
                jct_type = 'Nested'
            elif jct_start >= ann_start and jct_end == ann_stop:
                jct_type = '3p tethered'
            elif jct_start == ann_start and jct_end <= ann_stop:
                jct_type = '5p tethered'
            else:
                continue
        elif strand == '-':
            ann_size = ann_start - ann_stop
            # NOTE(review): relative to the '+' branch the tether labels are
            # swapped here (a shared intron[1] is called '5p tethered').
            # Possibly the labels follow the genomic side rather than the
            # transcript orientation -- confirm before changing them.
            if jct_start < ann_start and jct_end > ann_stop:
                jct_type = 'Nested'
            elif jct_start <= ann_start and jct_end == ann_stop:
                jct_type = '5p tethered'
            elif jct_start == ann_start and jct_end >= ann_stop:
                jct_type = '3p tethered'
            else:
                continue
        else:
            continue
        # The first matching intron wins.
        return jct_type, ann_size, ann_start, ann_stop

    return 'Other', 0, None, None


def build_junction_dict(junction_bed, gff3_file, transcript_dict,
                        organism=None, min_depth=5):
    """Assign junctions from a BED file to transcripts and classify them.

    Parameters
    ----------
    junction_bed : str
        Path to a tab-separated junction file. Only lines starting with 'c'
        or 'I' are read. The columns used are 0 (chromosome), 1 and 2
        (start / end), 4 (depth), 5 (strand) and 10 (comma-separated block
        sizes). Presumably a TopHat-style junctions BED -- confirm.
    gff3_file : str
        Annotation file handed to ``SP.list_splice_sites``.
    transcript_dict : dict
        Maps transcript name to a sequence indexed as
        ``[0]`` start, ``[1]`` end, ``[2]`` strand, ``[3]`` chromosome.
        The transcript name minus its last two characters is used as the key
        into the collapsed splice-site dictionary.
    organism : str, optional
        Passed through to ``SP.list_splice_sites``.
    min_depth : int, optional
        Junctions with a depth below this value are ignored (default 5).

    Returns
    -------
    dict
        Maps ``(transcript, ann_size)`` to a list of records
        ``[chromosome, jct_start, jct_end, strand, depth, jct_type, size,
        ann_size, ann_start, ann_stop]``. ``jct_type`` is one of
        'Annotated', 'Nested', '3p tethered', '5p tethered' or 'Other'.

    The number of sufficiently deep junctions that could not be assigned to
    any transcript is printed before returning.
    """
    junction_dict = {}
    unassigned_count = 0

    ss_dict, flag = SP.list_splice_sites(gff3_file, organism=organism)
    ss_by_gene = SP.collapse_ss_dict(ss_dict)
    # Only genes with at least one annotated intron are candidates.
    ss_by_gene = {k: v for k, v in ss_by_gene.items() if len(v) > 0}

    # Group candidate transcripts by chromosome for quick lookup.
    transcript_by_chr = {}
    for transcript, coords in transcript_dict.items():
        if transcript[:-2] in ss_by_gene:
            transcript_by_chr.setdefault(coords[3], []).append(transcript)

    # Roman-numeral chromosome names are translated to 'chrN'.
    lat_rom = {'I': 'chr1', 'II': 'chr2', 'III': 'chr3'}

    with open(junction_bed, 'r') as fin:
        for line in fin:
            if not line.startswith(('c', 'I')):
                continue

            columns = line.split('\t')
            chromosome = lat_rom.get(columns[0], columns[0])
            strand = columns[5]

            if strand == '+':
                block_sizes = columns[10].split(',')
                jct_start = int(columns[1]) + int(block_sizes[0]) - 1
                jct_end = int(columns[2]) - int(block_sizes[1]) - 1
            elif strand == '-':
                block_sizes = columns[10].split(',')
                jct_start = int(columns[2]) - int(block_sizes[1])
                jct_end = int(columns[1]) + int(block_sizes[0])
            else:
                # No coordinates can be derived for an unknown strand; never
                # fall back on the values of a previous line.
                jct_start = jct_end = None

            depth = int(columns[4])
            if depth < min_depth:
                continue
            if jct_start is None:
                unassigned_count += 1
                continue

            size = abs(jct_end - jct_start)

            # Per-junction state, reset for every line so that nothing leaks
            # over from the previous junction.
            jct_transcript = None
            jct_type, ann_size, ann_start, ann_stop = 'Other', 0, None, None

            # A chromosome without candidate transcripts yields no matches.
            for transcript in transcript_by_chr.get(chromosome, []):
                tx_coords = transcript_dict[transcript]
                contained = (jct_start > tx_coords[0]
                             and jct_end < tx_coords[1]
                             and strand == tx_coords[2])
                if not contained:
                    continue

                jct_transcript = transcript
                try:
                    (jct_type, ann_size,
                     ann_start, ann_stop) = _classify_junction(
                         jct_start, jct_end, strand, size,
                         ss_by_gene[transcript[:-2]])
                except IndexError:
                    # Malformed intron list: report it and try the next
                    # transcript on this chromosome.
                    print(transcript)
                    continue
                # The first transcript containing the junction is used.
                break

            if jct_transcript is None:
                unassigned_count += 1
                continue

            junction_dict.setdefault((jct_transcript, ann_size), []).append([
                chromosome, jct_start, jct_end, strand, depth, jct_type,
                size, ann_size, ann_start, ann_stop
            ])

    print(str(unassigned_count) + ' junctions not assigned to transcripts')
    return junction_dict
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Build position matrices for the 5' and 3' splice sites.

    For every annotated intron an 8-base window is taken around each splice
    site (reverse-complemented for '-' strand transcripts) and the bases are
    tallied per position.

    Parameters
    ----------
    gff3 : str
        Annotation file name. If it contains 'pombe' (case-insensitive) the
        SP helpers are called with ``organism='pombe'`` and transcripts are
        looked up as ``<gene>.1``; otherwise as ``<gene>T0``.
    fasta_dict : dict
        Maps chromosome name to its sequence (sliced with ``[start:stop]``).
    PSSM : bool, optional
        If falsy (default), return per-position base frequencies (counts
        divided by the number of introns). If truthy, return log2 odds
        against the background from ``gc_content``, replacing zero counts
        with a pseudocount of 1 first.

    Returns
    -------
    tuple of numpy.ndarray
        ``(pos_matrix_5prime, pos_matrix_3prime)``, each of shape (4, 8) with
        rows ordered A, C, T, G.

    Raises
    ------
    KeyError
        If a window contains a character other than A, C, T or G, or if the
        expected isoform name is missing from the transcript dictionary.

    Notes
    -----
    As a side effect, NumPy's global float print format is set to one
    decimal place.
    """
    # Populate the annotation dictionaries.
    if 'pombe' in gff3.lower():
        organism = 'pombe'
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, flag = SP.list_splice_sites(gff3, organism='pombe')
    else:
        organism = None
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, flag = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)

    # Background nucleotide probabilities; indexed below by matrix row, so
    # presumably ordered A, C, T, G -- TODO confirm against gc_content.
    nuc_prob = gc_content(fasta_dict)

    # Row index of each base in the matrices.
    base_dict = {"A": 0, "C": 1, "T": 2, "G": 3}
    pos_matrix_5prime = np.zeros([4, 8])
    pos_matrix_3prime = np.zeros([4, 8])

    isoform_suffix = '.1' if organism == 'pombe' else 'T0'
    counter1 = 0  # number of introns tallied

    for transcript, introns in ss_dict.items():
        isoform = transcript + isoform_suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            five_site, three_site = intron[0], intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(five_site - 1):(five_site + 7)]
                seq3 = fasta_dict[chrom][(three_site - 5):(three_site + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five_site - 6):(five_site + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three_site - 2):(three_site + 6)])
            else:
                # Without a usable strand there is no window to extract;
                # skip rather than re-count the previous intron's sequence.
                continue

            counter1 += 1
            for position, base in enumerate(seq5):
                pos_matrix_5prime[base_dict[base], position] += 1
            for position, base in enumerate(seq3):
                pos_matrix_3prime[base_dict[base], position] += 1

    float_formatter = lambda x: "%.1f" % x
    np.set_printoptions(formatter={'float_kind': float_formatter})

    n_introns = float(counter1)
    if PSSM:
        # Column vector so that row i is divided by nuc_prob[i].
        background = np.array(
            [nuc_prob[row] for row in range(4)], dtype=float).reshape(4, 1)
        for matrix in (pos_matrix_5prime, pos_matrix_3prime):
            # Pseudocount: avoids log2(0) for bases never seen at a position.
            matrix[matrix == 0] = 1
            matrix[:] = np.log2((matrix / n_introns) / background)
    else:
        pos_matrix_5prime /= n_introns
        pos_matrix_3prime /= n_introns

    return (pos_matrix_5prime, pos_matrix_3prime)