Пример #1
0
def add_intron_size(peaks_df, gff3, organism=None):
    """Annotate intronic peaks with the size of the intron containing them.

    Splice sites come from ``SP.list_splice_sites`` followed by
    ``SP.collapse_ss_dict`` and are looked up by ``row['transcript']``.

    Args:
        peaks_df: DataFrame with 'type', 'transcript' and 'position'
            columns. An 'intron size' column is added to it in place.
        gff3: Forwarded to ``SP.list_splice_sites``.
        organism: Forwarded to ``SP.list_splice_sites``.

    Returns:
        Tuple ``(peaks_df, no_peaks)``. ``no_peaks`` has the same layout as
        the collapsed splice-site dictionary but only keeps the site pairs
        that did not contain any intronic peak.

    Raises:
        KeyError: If an intronic peak names a transcript that is missing
            from the splice-site dictionary.
    """
    import copy

    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Work on an independent copy: removing matched introns from the lookup
    # table itself would hide them from later peaks in the same intron.
    no_peaks = copy.deepcopy(ss_dict)

    intron_sizes = []
    for _, row in peaks_df.iterrows():
        size = np.nan
        if row['type'] == 'intronic':
            transcript = row['transcript']
            position = row['position']
            for pair in ss_dict[transcript]:
                # Pairs may be stored in either orientation.
                low = min(pair[0], pair[1])
                high = max(pair[0], pair[1])
                if low <= position <= high:
                    size = high - low
                    # Several peaks can fall in one intron; drop it once.
                    if pair in no_peaks[transcript]:
                        no_peaks[transcript].remove(pair)
                    break
        intron_sizes.append(size)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
Пример #2
0
def add_intron_size(peaks_df, gff3, organism=None):
    """Annotate intronic peaks with the size of the intron containing them.

    Splice sites come from ``SP.list_splice_sites`` followed by
    ``SP.collapse_ss_dict`` and are looked up by ``row['transcript']``.

    Args:
        peaks_df: DataFrame with 'type', 'transcript' and 'position'
            columns. An 'intron size' column is added to it in place.
        gff3: Forwarded to ``SP.list_splice_sites``.
        organism: Forwarded to ``SP.list_splice_sites``.

    Returns:
        Tuple ``(peaks_df, no_peaks)``. ``no_peaks`` has the same layout as
        the collapsed splice-site dictionary but only keeps the site pairs
        that did not contain any intronic peak.

    Raises:
        KeyError: If an intronic peak names a transcript that is missing
            from the splice-site dictionary.
    """
    import copy

    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Work on an independent copy: removing matched introns from the lookup
    # table itself would hide them from later peaks in the same intron.
    no_peaks = copy.deepcopy(ss_dict)

    intron_sizes = []
    for _, row in peaks_df.iterrows():
        size = np.nan
        if row['type'] == 'intronic':
            transcript = row['transcript']
            position = row['position']
            for pair in ss_dict[transcript]:
                # Pairs may be stored in either orientation.
                low = min(pair[0], pair[1])
                high = max(pair[0], pair[1])
                if low <= position <= high:
                    size = high - low
                    # Several peaks can fall in one intron; drop it once.
                    if pair in no_peaks[transcript]:
                        no_peaks[transcript].remove(pair)
                    break
        intron_sizes.append(size)

    peaks_df['intron size'] = intron_sizes
    return peaks_df, no_peaks
Пример #3
0
def find_3p_site(branch_df, gff3, organism=None):
    """Attach the annotated 3' splice site and derived distances to each row.

    For every row the collapsed splice-site dictionary is searched for an
    intron whose first coordinate lies within one position of the row's
    '5p splice site'; the second coordinate of the first such intron is
    taken as the 3' splice site.

    Args:
        branch_df: DataFrame with 'transcript', '5p splice site' and
            'branch site' columns. Modified in place.
        gff3: Forwarded to ``SP.list_splice_sites``.
        organism: Forwarded to ``SP.list_splice_sites``.

    Returns:
        ``branch_df`` with three added columns: '3p splice site' (NaN when
        no intron matched), 'intron size' and 'Branch to 3p distance'
        (both absolute differences).

    Raises:
        KeyError: If a transcript (minus its last two characters) is not a
            key of the splice-site dictionary.
    """
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    three_coord = []
    for _, r in branch_df.iterrows():
        five_site = r['5p splice site']
        three_site = np.nan
        # The last two characters of the transcript name are stripped to
        # build the lookup key (presumably the isoform suffix, e.g. 'T0' or
        # '.1' -- TODO confirm against the caller).
        for intron in ss_dict[r['transcript'][:-2]]:
            if five_site in (intron[0] - 1, intron[0], intron[0] + 1):
                three_site = intron[1]
                break
        three_coord.append(three_site)

    branch_df['3p splice site'] = three_coord
    branch_df['intron size'] = (
        branch_df['5p splice site'] - branch_df['3p splice site']).apply(abs)
    branch_df['Branch to 3p distance'] = (
        branch_df['branch site'] - branch_df['3p splice site']).apply(abs)

    return branch_df
Пример #4
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect the 8-character windows around every annotated splice site.

    Args:
        gff3: Forwarded to ``SP.build_transcript_dict`` and
            ``SP.list_splice_sites``.
        fasta_dict: Mapping of chromosome name to sequence; sliced directly.
        organism: 'pombe' selects the '.1' isoform suffix, anything else
            selects 'T0'. Also forwarded to the ``SP`` helpers.

    Returns:
        Tuple ``(all_seq5, all_seq3)``: two parallel lists holding, for each
        intron, the window around ``intron[0]`` and around ``intron[1]``.
        Minus-strand windows are reverse-complemented.

    Raises:
        KeyError: If an isoform or chromosome is missing from its lookup.
        ValueError: If a transcript with introns has a strand other than
            '+' or '-'.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    suffix = '.1' if organism == 'pombe' else 'T0'

    all_seq5 = []
    all_seq3 = []
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, introns in ss_dict.items():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            five, three = intron[0], intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(five - 1):(five + 7)]
                seq3 = fasta_dict[chrom][(three - 5):(three + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five - 6):(five + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three - 2):(three + 6)])
            else:
                # Fail loudly instead of re-appending the previous intron's
                # sequences under a new entry.
                raise ValueError('Unexpected strand %r for transcript %s'
                                 % (strand, isoform))

            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
Пример #5
0
def generate_all_ss_seqs(gff3, fasta_dict, organism):
    """Collect the 8-character windows around every annotated splice site.

    Args:
        gff3: Forwarded to ``SP.build_transcript_dict`` and
            ``SP.list_splice_sites``.
        fasta_dict: Mapping of chromosome name to sequence; sliced directly.
        organism: 'pombe' selects the '.1' isoform suffix, anything else
            selects 'T0'. Also forwarded to the ``SP`` helpers.

    Returns:
        Tuple ``(all_seq5, all_seq3)``: two parallel lists holding, for each
        intron, the window around ``intron[0]`` and around ``intron[1]``.
        Minus-strand windows are reverse-complemented.

    Raises:
        KeyError: If an isoform or chromosome is missing from its lookup.
        ValueError: If a transcript with introns has a strand other than
            '+' or '-'.
    """
    transcript_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss)

    suffix = '.1' if organism == 'pombe' else 'T0'

    all_seq5 = []
    all_seq3 = []
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, introns in ss_dict.items():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            five, three = intron[0], intron[1]
            if strand == '+':
                seq5 = fasta_dict[chrom][(five - 1):(five + 7)]
                seq3 = fasta_dict[chrom][(three - 5):(three + 3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(five - 6):(five + 2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(three - 2):(three + 6)])
            else:
                # Fail loudly instead of re-appending the previous intron's
                # sequences under a new entry.
                raise ValueError('Unexpected strand %r for transcript %s'
                                 % (strand, isoform))

            all_seq5.append(seq5)
            all_seq3.append(seq3)
    return all_seq5, all_seq3
Пример #6
0
def collect_intron_seq(gff3_file, fasta_file, ss_dict=None, junction_bed=None, gene_list=None, peak_df=None, organism=None):
    """Extract a 15-character sequence next to the first site of each intron.

    The introns come from the first available source, in this order:
    ``ss_dict`` as given, ``SP.build_junction_dict(junction_bed, ...)``,
    50-position windows built from ``peak_df``, or the annotated splice
    sites from ``SP.list_splice_sites``.

    Args:
        gff3_file: Forwarded to the ``SP`` helpers.
        fasta_file: Either a dict of chromosome -> sequence, a path ending
            in 'json' (loaded with ``json.load``), or any other path
            (parsed with ``make_fasta_dict``).
        ss_dict: Optional ready-made mapping of transcript -> site pairs.
        junction_bed: Optional junction BED path. When it is given, the
            dictionary keys are used as-is; otherwise an isoform suffix
            ('.1' for pombe, 'T0' otherwise) is appended to each key.
        gene_list: Forwarded to ``SP.list_splice_sites``.
        peak_df: Optional DataFrame with 'type', 'transcript', 'strand' and
            'position' columns; rows whose type contains 'prime' are
            dropped before use.
        organism: 'pombe' or anything else (see ``junction_bed``).

    Returns:
        Dict mapping '<transcript>-<chrom>:<coordinate>' to the extracted
        sequence (reverse-complemented on the minus strand). Introns on a
        strand other than '+' or '-' are skipped.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    # isinstance also accepts dict subclasses such as OrderedDict.
    if isinstance(fasta_file, dict):
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as f:
            fasta_dict = json.load(f)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            # NOTE(review): the indexing below expects transcript-name keys
            # and (site, site) values -- confirm SP.build_junction_dict
            # returns that layout.
            ss_dict = SP.build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for _, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, _ = SP.list_splice_sites(gff3_file, gene_list=gene_list, organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    suffix = '.1' if organism == 'pombe' else 'T0'

    seq_dict = {}
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, introns in ss_dict.items():
        name = transcript + suffix if junction_bed is None else transcript
        strand = transcript_dict[name][2]
        chrom = transcript_dict[name][3]
        for intron in introns:
            start = intron[0]
            if strand == '+':
                key = name + '-' + chrom + ':' + str(start + 1)
                seq_dict[key] = fasta_dict[chrom][start + 2:start + 17]
            elif strand == '-':
                key = name + '-' + chrom + ':' + str(start)
                seq = fasta_dict[chrom][start - 16:start - 1]
                seq_dict[key] = SP.reverse_complement(seq)
    return seq_dict
Пример #7
0
def check_intron_position(transcript, position, gff3, organism):
    """Report whether the intron near ``position`` is first or last.

    The introns of ``transcript`` are scanned in the iteration order of the
    collapsed splice-site dictionary. The first intron whose first
    coordinate satisfies ``position - 3 <= intron[0] < position + 3``
    decides the answer.

    Args:
        transcript: Key into the collapsed splice-site dictionary.
        position: Integer coordinate to look for.
        gff3: Forwarded to ``SP.list_splice_sites``.
        organism: Forwarded to ``SP.list_splice_sites``.

    Returns:
        Tuple ``(first, last)`` of booleans. Both are False when no intron
        matches or the match is an internal intron. 'first' takes
        precedence, so a single-intron transcript yields ``(True, False)``.

    Raises:
        KeyError: If ``transcript`` is not in the splice-site dictionary.
    """
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # NOTE(review): assumes the introns are stored in transcript order --
    # confirm against SP.collapse_ss_dict.
    introns = ss_dict[transcript]
    # Compare against the number of introns, not the length of one pair.
    last_index = len(introns) - 1

    first = False
    last = False
    for n, intron in enumerate(introns):
        if intron[0] in range(position - 3, position + 3):
            if n == 0:
                first = True
            elif n == last_index:
                last = True
            break
    return first, last
Пример #8
0
def check_intron_position(transcript, position, gff3, organism):
    """Report whether the intron near ``position`` is first or last.

    The introns of ``transcript`` are scanned in the iteration order of the
    collapsed splice-site dictionary. The first intron whose first
    coordinate satisfies ``position - 3 <= intron[0] < position + 3``
    decides the answer.

    Args:
        transcript: Key into the collapsed splice-site dictionary.
        position: Integer coordinate to look for.
        gff3: Forwarded to ``SP.list_splice_sites``.
        organism: Forwarded to ``SP.list_splice_sites``.

    Returns:
        Tuple ``(first, last)`` of booleans. Both are False when no intron
        matches or the match is an internal intron. 'first' takes
        precedence, so a single-intron transcript yields ``(True, False)``.

    Raises:
        KeyError: If ``transcript`` is not in the splice-site dictionary.
    """
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # NOTE(review): assumes the introns are stored in transcript order --
    # confirm against SP.collapse_ss_dict.
    introns = ss_dict[transcript]
    # Compare against the number of introns, not the length of one pair.
    last_index = len(introns) - 1

    first = False
    last = False
    for n, intron in enumerate(introns):
        if intron[0] in range(position - 3, position + 3):
            if n == 0:
                first = True
            elif n == last_index:
                last = True
            break
    return first, last
Пример #9
0
def find_3p_site(branch_df, gff3, organism=None):
    """Attach the annotated 3' splice site and derived distances to each row.

    For every row the collapsed splice-site dictionary is searched for an
    intron whose first coordinate lies within one position of the row's
    '5p splice site'; the second coordinate of the first such intron is
    taken as the 3' splice site.

    Args:
        branch_df: DataFrame with 'transcript', '5p splice site' and
            'branch site' columns. Modified in place.
        gff3: Forwarded to ``SP.list_splice_sites``.
        organism: Forwarded to ``SP.list_splice_sites``.

    Returns:
        ``branch_df`` with three added columns: '3p splice site' (NaN when
        no intron matched), 'intron size' and 'Branch to 3p distance'
        (both absolute differences).

    Raises:
        KeyError: If a transcript (minus its last two characters) is not a
            key of the splice-site dictionary.
    """
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    three_coord = []
    for _, r in branch_df.iterrows():
        five_site = r['5p splice site']
        three_site = np.nan
        # The last two characters of the transcript name are stripped to
        # build the lookup key (presumably the isoform suffix, e.g. 'T0' or
        # '.1' -- TODO confirm against the caller).
        for intron in ss_dict[r['transcript'][:-2]]:
            if five_site in (intron[0] - 1, intron[0], intron[0] + 1):
                three_site = intron[1]
                break
        three_coord.append(three_site)

    branch_df['3p splice site'] = three_coord
    branch_df['intron size'] = (
        branch_df['5p splice site'] - branch_df['3p splice site']).apply(abs)
    branch_df['Branch to 3p distance'] = (
        branch_df['branch site'] - branch_df['3p splice site']).apply(abs)

    return branch_df
Пример #10
0
def _classify_junction(jct_start, jct_end, strand, introns):
    """Compare one junction with the annotated introns of a gene.

    Returns a tuple ``(jct_type, ann_size, ann_start, ann_stop, intron_num)``.
    ``ann_size`` is None (and ``jct_type`` is 'Other') when no annotated
    intron matches. ``intron_num`` is 1-based.
    """
    all_sites = list(zip(*introns))
    # NOTE(review): start and end are tested independently, so a junction
    # joining the start of one annotated intron to the end of another is
    # also labelled 'Annotated' -- confirm this is intended.
    if jct_start in all_sites[0] and jct_end in all_sites[1]:
        return ('Annotated', abs(jct_end - jct_start), jct_start, jct_end,
                all_sites[0].index(jct_start) + 1)

    for n, intron in enumerate(introns, 1):
        jct_type = None
        ann_size = None
        if strand == '+':
            if jct_start > intron[0] and jct_end < intron[1]:
                jct_type = 'Nested'
            elif jct_start >= intron[0] and jct_end == intron[1]:
                jct_type = '3p tethered'
            elif jct_start == intron[0] and jct_end <= intron[1]:
                jct_type = '5p tethered'
            ann_size = abs(intron[1] - intron[0])
        elif strand == '-':
            # NOTE(review): on this strand a shared intron[1] is labelled
            # '5p tethered' and a shared intron[0] '3p tethered', the
            # opposite of the '+' branch -- confirm the labels.
            if jct_start < intron[0] and jct_end > intron[1]:
                jct_type = 'Nested'
            elif jct_start <= intron[0] and jct_end == intron[1]:
                jct_type = '5p tethered'
            elif jct_start == intron[0] and jct_end >= intron[1]:
                jct_type = '3p tethered'
            ann_size = intron[0] - intron[1]
        if jct_type is not None:
            return (jct_type, ann_size, intron[0], intron[1], n)
    return ('Other', None, None, None, None)


def build_junction_dict(junction_bed, gff3_file, transcript_dict, organism=None):
    """Assign junctions from a BED file to transcripts and annotated introns.

    Only lines starting with 'c' or 'I' are read. Columns 0, 1, 2, 4, 5 and
    10 are used as chromosome, start, end, depth, strand and a
    comma-separated pair of sizes (presumably the BED12 blockSizes field --
    TODO confirm). Chromosomes named 'I', 'II' and 'III' are renamed to
    'chr1', 'chr2' and 'chr3'. Junctions with depth below 5 are ignored.

    Each remaining junction is assigned to the first transcript on its
    chromosome that has the same strand and passes the coordinate test,
    then classified against that gene's annotated introns as 'Annotated',
    'Nested', '5p tethered', '3p tethered' or 'Other'.

    Args:
        junction_bed: Path of the junction BED file.
        gff3_file: Forwarded to ``SP.list_splice_sites``.
        transcript_dict: Mapping of transcript -> sequence indexed as
            ``[start, end, strand, chromosome]``.
        organism: Forwarded to ``SP.list_splice_sites``.

    Returns:
        Dict keyed by ``(transcript, ann_size)`` whose values are lists of
        ``[chromosome, jct_start, jct_end, strand, depth, jct_type, size,
        ann_size, ann_start, ann_stop]`` records. For 'Other' junctions
        ``ann_size`` is 0 and the annotated coordinates are None.

    Side effects:
        Prints the number of junctions that matched no transcript.
    """
    junction_dict = {}
    transcript_by_chr = {}
    unassigned_count = 0
    lat_rom = {'I': 'chr1', 'II': 'chr2', 'III': 'chr3'}

    ss_dict, _ = SP.list_splice_sites(gff3_file, organism=organism)
    ss_by_gene = SP.collapse_ss_dict(ss_dict)
    ss_by_gene = {k: v for k, v in ss_by_gene.items() if len(v) > 0}

    # Index the intron-containing transcripts by chromosome. The last two
    # characters of a transcript name are dropped to get the gene key.
    for transcript, coords in transcript_dict.items():
        if transcript[:-2] in ss_by_gene:
            transcript_by_chr.setdefault(coords[3], []).append(transcript)

    with open(junction_bed, 'r') as fin:
        for line in fin:
            if not line.startswith(('c', 'I')):
                continue

            columns = line.split('\t')
            chromosome = lat_rom.get(columns[0], columns[0])

            # A chromosome without transcripts gets an empty candidate list
            # instead of reusing the list left over from an earlier line.
            transcript_list = transcript_by_chr.get(chromosome, [])

            strand = columns[5]
            block_sizes = columns[10].split(',')
            # NOTE(review): a strand other than '+'/'-' leaves jct_start and
            # jct_end at the previous line's values (or undefined on the
            # first line) -- confirm the input never contains one.
            if strand == '+':
                jct_start = int(columns[1]) + int(block_sizes[0]) - 1
                jct_end = int(columns[2]) - int(block_sizes[1]) - 1
            elif strand == '-':
                jct_start = int(columns[2]) - int(block_sizes[1])
                jct_end = int(columns[1]) + int(block_sizes[0])
            depth = int(columns[4])
            size = abs(jct_end - jct_start)

            if depth < 5:
                continue

            jct_transcript = None
            jct_type = 'Other'
            ann_size = None
            ann_start = None
            ann_stop = None

            assigned = False
            for transcript in transcript_list:
                coords = transcript_dict[transcript]
                # NOTE(review): on the minus strand jct_start is the larger
                # coordinate, so this test does not guarantee the junction
                # lies inside the transcript -- confirm.
                if jct_start > coords[0] and jct_end < coords[1] and strand == coords[2]:
                    assigned = True
                    jct_transcript = transcript
                    try:
                        jct_type, ann_size, ann_start, ann_stop, _ = _classify_junction(
                            jct_start, jct_end, strand, ss_by_gene[transcript[:-2]])
                        break
                    except IndexError:
                        # Malformed intron entry: report it and try the
                        # next candidate transcript.
                        print(transcript)
            if assigned is False:
                unassigned_count += 1

            if jct_transcript is not None:
                if ann_size is None:
                    jct_type = "Other"
                    ann_size = 0
                    ann_start = None
                    ann_stop = None
                junction_dict.setdefault((jct_transcript, ann_size), []).append(
                    [chromosome, jct_start, jct_end, strand, depth, jct_type, size, ann_size, ann_start, ann_stop])

    print(str(unassigned_count)+' junctions not assigned to transcripts')
    return junction_dict
Пример #11
0
def quant_from_peak_df(peak_df, gff3, fa_dict, organism=None):
    """Build a per-splice-site table with intron size, sequences and scores.

    Peaks whose 'type' is '3prime' or whose 'looks like' is 'AG' are
    dropped. Each remaining peak is matched to an annotated intron whose
    first coordinate lies in ``[position - 5, position + 5)``; matched
    peaks get the intron size (in thousands), the 8-character windows
    around both splice sites and their PSSM scores. The table is then
    extended by ``SP.backfill_splice_sites`` and
    ``SP.find_score_branches_ppy``.

    Args:
        peak_df: DataFrame with 'type', 'looks like', 'chromosome',
            'position', 'transcript', 'strand', 'height' and 'index'
            columns. It is not modified.
        gff3: Forwarded to the ``SP`` helpers.
        fa_dict: Mapping of chromosome name to sequence; sliced directly.
        organism: Forwarded to the ``SP`` helpers.

    Returns:
        The DataFrame returned by ``SP.find_score_branches_ppy``.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Copy so that the column assignments below act on an independent frame.
    keep = (peak_df['type'] != '3prime') & (peak_df['looks like'] != 'AG')
    quant_df = peak_df[keep].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {'intron size':[], 'alt splicing':[], '5p score':[], '3p score':[], 'seq5':[], 'seq3':[]}
    new_index = []

    for coord in quant_df.index:
        # Several peaks can share a coordinate; keep the tallest one.
        # .iloc[0] is the positional lookup the removed .ix[0] performed on
        # this string-labelled index.
        coord_df = quant_df[quant_df.index == coord]
        coord_df = coord_df.sort_values('height', ascending=False).iloc[0]

        three_site = None
        alt3 = False
        same_transcript = quant_df['transcript'] == coord_df['transcript']
        introns = ss_dict[coord_df['transcript']]
        if 'prime' in coord_df['type']:
            peak_range = range(coord_df['position']-5, coord_df['position']+5)
            for intron in introns:
                if intron[0] in peak_range:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            if len(quant_df[same_transcript & (quant_df['type'] == 'AG')]) > 0:
                alt3 = True
        else:
            # NOTE(review): `in` on a Series tests the index labels (genome
            # coordinates here), not the 'type' values, so this branch looks
            # unreachable in practice -- confirm the intent before changing.
            if 'AG' in quant_df[same_transcript]['type']:
                five_site = coord_df['position']
                three_df = quant_df[same_transcript & (quant_df['type'] == 'AG')]
                three_df = three_df.sort_values('height', ascending=False)
                three_site = three_df.iloc[0]['position']

        if three_site is not None:
            new_index.append(coord)
            size = abs(three_site-five_site)/1000.
            column_dict['intron size'].append(size)
            column_dict['alt splicing'].append(alt3)

            chrom_seq = fa_dict[coord_df['chromosome']]
            if coord_df['strand'] == '+':
                s5 = chrom_seq[five_site-2:five_site+6]
                s3 = chrom_seq[three_site-6:three_site+2]
            elif coord_df['strand'] == '-':
                s5 = SP.reverse_complement(chrom_seq[five_site-6:five_site+2])
                s3 = SP.reverse_complement(chrom_seq[three_site-2:three_site+6])
            column_dict['seq5'].append(s5)
            column_dict['seq3'].append(s3)
            scores = SP.simple_score_junction(s5, s3, pssm)
            column_dict['3p score'].append(scores[1])
            column_dict['5p score'].append(scores[0])

    new_quant_df = quant_df[quant_df.index.isin(new_index)][['genome coord','chromosome',
                                                             'strand','transcript','position','type']].copy()
    # .items() works on both Python 2 and Python 3 dictionaries.
    for column, data in column_dict.items():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df, gff3, fa_dict, pssm, organism=organism)

    # NOTE(review): the branch-site file path is hard-coded to one machine.
    new_quant_df = SP.find_score_branches_ppy(new_quant_df, '/home/jordan/GENOMES/S288C/S288C_branches2.txt', fa_dict)

    return new_quant_df
Пример #12
0
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Build one row per annotated intron for every transcript in ``df``.

    Introns whose first coordinate lies in ``[peak - 5, peak + 5)`` of a
    peak already in ``df`` reuse that peak's scores, sequences, type and
    'alt splicing' flag. All other introns are filled in from the
    annotation: their sequences are sliced from ``fa_dict`` and scored with
    ``SP.simple_score_junction``.

    Args:
        df: DataFrame indexed by '<chromosome>:<position>' with
            'transcript', 'strand', 'chromosome', 'position', '3p score',
            '5p score', 'alt splicing', 'type', 'seq5' and 'seq3' columns.
        gff3: Forwarded to the ``SP`` helpers.
        fa_dict: Mapping of chromosome name to sequence; sliced directly.
        PSSM: Scoring matrices forwarded to ``SP.simple_score_junction``.
        organism: 'pombe' selects the '.1' isoform suffix, anything else
            selects 'T0'. Also forwarded to the ``SP`` helpers.

    Returns:
        A new DataFrame indexed by '<chromosome>:<position>'. Exon and
        transcript sizes are divided by 1000; 'intron size' is not.
        'peak' is True for rows taken from ``df``.
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, _ = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {'position':[],'transcript':[],'alt splicing':[],'type':[],'strand':[], 'introns in transcript':[],
                   'intron size':[],'chromosome':[], '5p score':[], '3p score':[], 'intron position':[], 'exon size (us)':[],
                   'exon size (ds)':[],'transcript size':[], 'peak':[], 'seq5':[],'seq3':[]}
    new_index = []

    iso_suffix = '.1' if organism == 'pombe' else 'T0'

    for tx in set(df['transcript']):
        tx_rows = df[df['transcript'] == tx]
        strand = tx_rows.iloc[0]['strand']
        chrom = tx_rows.iloc[0]['chromosome']
        iso = tx + iso_suffix

        # Order introns by their first coordinate, following the strand.
        splice_sites = ss_dict[tx]
        if strand == '+':
            splice_sites = sorted(splice_sites, key=lambda x: x[0])
        elif strand == '-':
            splice_sites = sorted(splice_sites, key=lambda x: x[0], reverse=True)

        df_pos = None
        for n, (five_site, three_site) in enumerate(splice_sites):
            # Check if already in dataframe
            in_df = False
            for peak in tx_rows['position']:
                if five_site in range(int(peak)-5, int(peak)+5):
                    in_df = True
                    df_pos = peak
                    break

            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site-five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append((tx_dict[iso][1]-tx_dict[iso][0])/1000.)

            # Check if first or last intron and add exon size
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append((five_site-tx_dict[iso][0])/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n+1][0] - three_site)/1000.
                        # A negative length means the next intron starts
                        # before this one ends; fall back to the one after.
                        try:
                            if ds_length < 0:
                                ds_length = (splice_sites[n+2][0] - three_site)/1000.
                        except IndexError:
                            ds_length = np.nan
                    else:
                        ds_length = (tx_dict[iso][1] - three_site)/1000.

                elif strand == '-':
                    column_dict['exon size (us)'].append((tx_dict[iso][1]-five_site)/1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site - splice_sites[n+1][0])/1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site - splice_sites[n+2][0])/1000.
                        except IndexError:
                            ds_length = np.nan
                    else:
                        ds_length = (three_site - tx_dict[iso][0])/1000.
                column_dict['exon size (ds)'].append(ds_length)

            elif n == len(splice_sites)-1:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)

                if strand == '+':
                    column_dict['exon size (ds)'].append((tx_dict[iso][1]-three_site)/1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append((three_site - tx_dict[iso][0])/1000.)
            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append((abs(five_site-splice_sites[n-1][1])-1)/1000.)
                column_dict['exon size (ds)'].append(abs(three_site - splice_sites[n+1][0])/1000.)

            if in_df is True:
                # Reuse the measurements of the matching peak.
                peak_index = chrom+':'+str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['3p score'].append(df.loc[peak_index,'3p score'])
                column_dict['5p score'].append(df.loc[peak_index,'5p score'])
                column_dict['alt splicing'].append(df.loc[peak_index,'alt splicing'])
                column_dict['type'].append(df.loc[peak_index,'type'])
                column_dict['peak'].append(True)
                column_dict['seq5'].append(df.loc[peak_index,'seq5'])
                column_dict['seq3'].append(df.loc[peak_index,'seq3'])

            if in_df is False:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Get position, index and sequence for scoring and position code
                if strand == '+':
                    column_dict['position'].append(five_site+1)
                    new_index.append(chrom+':'+str(five_site+1))
                    sequence1 = fa_dict[chrom][(five_site-1):(five_site+7)]
                    sequence2 = fa_dict[chrom][(three_site-5):(three_site+3)]

                elif strand == '-':
                    column_dict['position'].append(five_site-1)
                    new_index.append(chrom+':'+str(five_site-1))
                    sequence1 = fa_dict[chrom][(five_site-6):(five_site+2)]
                    sequence1 = SP.reverse_complement(sequence1)
                    sequence2 = fa_dict[chrom][(three_site-2):(three_site+6)]
                    sequence2 = SP.reverse_complement(sequence2)

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                # Score sequences
                score_5, score_3 = SP.simple_score_junction(sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Create new dataframe from column dictionary
    new_df = pd.DataFrame(columns=list(column_dict.keys()), index=new_index)
    # .items() works on both Python 2 and Python 3 dictionaries.
    for column, data in column_dict.items():
        new_df[column] = data

    return new_df
Пример #13
0
def collect_intron_seq(gff3_file,
                       fasta_file,
                       ss_dict=None,
                       junction_bed=None,
                       gene_list=None,
                       peak_df=None,
                       organism=None):
    """Extract a 15-character sequence next to the first site of each intron.

    The introns come from the first available source, in this order:
    ``ss_dict`` as given, ``SP.build_junction_dict(junction_bed, ...)``,
    50-position windows built from ``peak_df``, or the annotated splice
    sites from ``SP.list_splice_sites``.

    Args:
        gff3_file: Forwarded to the ``SP`` helpers.
        fasta_file: Either a dict of chromosome -> sequence, a path ending
            in 'json' (loaded with ``json.load``), or any other path
            (parsed with ``make_fasta_dict``).
        ss_dict: Optional ready-made mapping of transcript -> site pairs.
        junction_bed: Optional junction BED path. When it is given, the
            dictionary keys are used as-is; otherwise an isoform suffix
            ('.1' for pombe, 'T0' otherwise) is appended to each key.
        gene_list: Forwarded to ``SP.list_splice_sites``.
        peak_df: Optional DataFrame with 'type', 'transcript', 'strand' and
            'position' columns; rows whose type contains 'prime' are
            dropped before use.
        organism: 'pombe' or anything else (see ``junction_bed``).

    Returns:
        Dict mapping '<transcript>-<chrom>:<coordinate>' to the extracted
        sequence (reverse-complemented on the minus strand). Introns on a
        strand other than '+' or '-' are skipped.
    """
    transcript_dict = SP.build_transcript_dict(gff3_file, organism=organism)

    # isinstance also accepts dict subclasses such as OrderedDict.
    if isinstance(fasta_file, dict):
        fasta_dict = fasta_file
    elif fasta_file.endswith('json'):
        with open(fasta_file, 'r') as f:
            fasta_dict = json.load(f)
    else:
        fasta_dict = make_fasta_dict(fasta_file)

    if ss_dict is None:
        if junction_bed is not None:
            # NOTE(review): the indexing below expects transcript-name keys
            # and (site, site) values -- confirm SP.build_junction_dict
            # returns that layout.
            ss_dict = SP.build_junction_dict(junction_bed,
                                             gff3_file,
                                             transcript_dict,
                                             organism=organism)
        elif peak_df is not None:
            ss_dict = {}
            peak_df = peak_df[~peak_df['type'].str.contains('prime')]
            for _, r in peak_df.iterrows():
                sites = ss_dict.setdefault(r['transcript'], [])
                if r['strand'] == '+':
                    sites.append((r['position'], r['position'] + 50))
                elif r['strand'] == '-':
                    sites.append((r['position'], r['position'] - 50))
        else:
            ss_dict, _ = SP.list_splice_sites(gff3_file,
                                              gene_list=gene_list,
                                              organism=organism)
            ss_dict = SP.collapse_ss_dict(ss_dict)

    suffix = '.1' if organism == 'pombe' else 'T0'

    seq_dict = {}
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, introns in ss_dict.items():
        name = transcript + suffix if junction_bed is None else transcript
        strand = transcript_dict[name][2]
        chrom = transcript_dict[name][3]
        for intron in introns:
            start = intron[0]
            if strand == '+':
                key = name + '-' + chrom + ':' + str(start + 1)
                seq_dict[key] = fasta_dict[chrom][start + 2:start + 17]
            elif strand == '-':
                key = name + '-' + chrom + ':' + str(start)
                seq = fasta_dict[chrom][start - 16:start - 1]
                seq_dict[key] = SP.reverse_complement(seq)
    return seq_dict
Пример #14
0
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Build position matrices for the annotated 5' and 3' splice sites.

    For every annotated intron the 8-character windows around ``intron[0]``
    and ``intron[1]`` are tallied into two 4x8 matrices whose rows are, in
    order, A, C, T and G.

    Args:
        gff3: Annotation path. The organism is taken to be 'pombe' when the
            lower-cased path contains 'pombe'.
        fasta_dict: Mapping of chromosome name to sequence; also passed to
            ``gc_content``.
        PSSM: When False, counts are divided by the number of introns. When
            True, zero counts are first replaced by 1 and each cell becomes
            ``log2((count / introns) / nuc_prob[row])``.

    Returns:
        Tuple ``(pos_matrix_5prime, pos_matrix_3prime)`` of numpy arrays.

    Raises:
        KeyError: If a window contains a character other than the
            upper-case letters A, C, T and G.
        ValueError: If a transcript with introns has a strand other than
            '+' or '-'.

    Side effects:
        Sets the global numpy float print format to one decimal place.
    """
    # Populate gene dictionary and build genome
    if 'pombe' in gff3.lower():
        organism = 'pombe'
        transcript_dict = SP.build_transcript_dict(gff3, organism='pombe')
        ss, _ = SP.list_splice_sites(gff3, organism='pombe')
    else:
        organism = None
        transcript_dict = SP.build_transcript_dict(gff3)
        ss, _ = SP.list_splice_sites(gff3)
    ss_dict = SP.collapse_ss_dict(ss)
    # NOTE(review): nuc_prob is indexed by matrix row below, so it is
    # assumed to follow the same A, C, T, G order -- confirm in gc_content.
    nuc_prob = gc_content(fasta_dict)

    base_dict = {"A":0, "C":1, "T":2, "G":3}
    suffix = '.1' if organism == 'pombe' else 'T0'

    pos_matrix_5prime = np.zeros([4,8])
    pos_matrix_3prime = np.zeros([4,8])

    intron_count = 0
    # .items() works on both Python 2 and Python 3 dictionaries.
    for transcript, introns in ss_dict.items():
        isoform = transcript + suffix
        strand = transcript_dict[isoform][2]
        chrom = transcript_dict[isoform][3]

        for intron in introns:
            intron_count += 1
            if strand == '+':
                seq5 = fasta_dict[chrom][(intron[0]-1):(intron[0]+7)]
                seq3 = fasta_dict[chrom][(intron[1]-5):(intron[1]+3)]
            elif strand == '-':
                seq5 = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0]-6):(intron[0]+2)])
                seq3 = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1]-2):(intron[1]+6)])
            else:
                # Fail loudly instead of tallying the previous intron's
                # sequence a second time.
                raise ValueError('Unexpected strand %r for transcript %s'
                                 % (strand, isoform))

            for col, base in enumerate(seq5):
                pos_matrix_5prime[base_dict[base], col] += 1
            for col, base in enumerate(seq3):
                pos_matrix_3prime[base_dict[base], col] += 1

    float_formatter = lambda x: "%.1f" % x
    np.set_printoptions(formatter={'float_kind':float_formatter})

    total = float(intron_count)
    for matrix in (pos_matrix_5prime, pos_matrix_3prime):
        for row in range(4):
            if PSSM is False:
                matrix[row, :] = matrix[row, :] / total
            elif PSSM is True:
                counts = matrix[row, :]
                # Pseudocount for unseen bases so the logarithm is finite.
                counts[counts == 0] = 1
                matrix[row, :] = np.log2((counts / total) / nuc_prob[row])

    return (pos_matrix_5prime, pos_matrix_3prime)
Пример #15
0
def quant_from_peak_df(peak_df,
                       gff3,
                       fa_dict,
                       organism=None,
                       branch_file='/home/jordan/GENOMES/S288C/S288C_branches2.txt'):
    """Build a per-intron feature table from a table of called peaks.

    Peaks whose 'type' is '3prime' or whose 'looks like' is 'AG' are dropped.
    Each remaining peak is keyed by "<chromosome>:<position>" and paired with
    a 5' and a 3' splice site:

    * a peak whose type contains 'prime' is matched to the first annotated
      intron of its transcript whose 5' site lies in
      [position - 5, position + 5);
    * any other peak uses its own position as the 5' site and the tallest
      'AG' peak of the same transcript as the 3' site.

    Paired peaks receive the intron size (in kb), the two 8-nt junction
    sequences and their PSSM scores.  The table is then passed through
    SP.backfill_splice_sites and SP.find_score_branches_ppy.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Must provide the columns 'type', 'looks like', 'chromosome',
        'position', 'height', 'transcript', 'strand' and 'index'.
    gff3 :
        Annotation handed unchanged to the SP helpers.
    fa_dict :
        Mapping from chromosome name to a sliceable sequence.
    organism :
        Forwarded to SP.list_splice_sites and SP.backfill_splice_sites.
    branch_file : str
        Path handed to SP.find_score_branches_ppy.  Defaults to the path
        that used to be hard-coded in this function.

    Returns
    -------
    The value returned by SP.find_score_branches_ppy.
    """
    pssm = SP.generate_consensus_matrix(gff3, fa_dict, PSSM=True)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    # Work on a copy: the filtered frame is modified below and must not be a
    # view onto the caller's peak_df.
    quant_df = peak_df[(peak_df['type'] != '3prime')
                       & (peak_df['looks like'] != 'AG')].copy()
    quant_df['genome coord'] = quant_df['chromosome'].str.cat(
        quant_df['position'].values.astype(str), sep=':')
    quant_df.index = quant_df['genome coord']
    quant_df = quant_df.drop('index', axis=1)

    column_dict = {
        'intron size': [],
        'alt splicing': [],
        '5p score': [],
        '3p score': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    for coord in quant_df.index:
        # Several peaks may share a coordinate; keep the tallest one.
        coord_df = quant_df[quant_df.index == coord]
        coord_df = coord_df.sort_values('height', ascending=False).iloc[0]
        introns = ss_dict[coord_df['transcript']]

        # 'AG' peaks on the same transcript.  This has to compare the column
        # values: `'AG' in series` would test the index labels instead.
        ag_peaks = quant_df[(quant_df['transcript'] == coord_df['transcript'])
                            & (quant_df['type'] == 'AG')]

        # Reset for every peak so nothing leaks in from the previous one.
        five_site = None
        three_site = None
        alt3 = False

        if 'prime' in coord_df['type']:
            position = coord_df['position']
            for intron in introns:
                if position - 5 <= intron[0] < position + 5:
                    five_site = intron[0]
                    three_site = intron[1]
                    break
            alt3 = len(ag_peaks) > 0
        elif len(ag_peaks) > 0:
            five_site = coord_df['position']
            tallest_ag = ag_peaks.sort_values('height', ascending=False)
            three_site = tallest_ag.iloc[0]['position']

        if three_site is None:
            continue

        new_index.append(coord)
        column_dict['intron size'].append(
            abs(three_site - five_site) / 1000.)
        column_dict['alt splicing'].append(alt3)

        chrom_seq = fa_dict[coord_df['chromosome']]
        if coord_df['strand'] == '+':
            s5 = chrom_seq[five_site - 2:five_site + 6]
            s3 = chrom_seq[three_site - 6:three_site + 2]
        elif coord_df['strand'] == '-':
            s5 = SP.reverse_complement(chrom_seq[five_site - 6:five_site + 2])
            s3 = SP.reverse_complement(
                chrom_seq[three_site - 2:three_site + 6])
        column_dict['seq5'].append(s5)
        column_dict['seq3'].append(s3)
        scores = SP.simple_score_junction(s5, s3, pssm)
        column_dict['3p score'].append(scores[1])
        column_dict['5p score'].append(scores[0])

    # One list entry was appended per retained row, so the lengths line up.
    new_quant_df = quant_df[quant_df.index.isin(new_index)][[
        'genome coord', 'chromosome', 'strand', 'transcript', 'position',
        'type'
    ]].copy()
    for column, data in column_dict.items():
        new_quant_df[column] = data

    new_quant_df = new_quant_df.drop_duplicates(
        subset='genome coord', keep='first').set_index('genome coord')

    new_quant_df = SP.backfill_splice_sites(new_quant_df,
                                            gff3,
                                            fa_dict,
                                            pssm,
                                            organism=organism)

    new_quant_df = SP.find_score_branches_ppy(new_quant_df, branch_file,
                                              fa_dict)

    return new_quant_df
Пример #16
0
def backfill_splice_sites(df, gff3, fa_dict, PSSM, organism=None):
    """Emit one row per annotated intron of every transcript present in df.

    For each transcript in df['transcript'] the annotated introns are walked
    in 5'->3' order (ascending 5' site on '+', descending on '-').  An intron
    whose 5' site lies in [peak - 5, peak + 5) of one of the transcript's
    peaks re-uses that peak's scores, type, alt-splicing flag and sequences
    from df (looked up by the "<chromosome>:<position>" index label) and is
    flagged peak=True.  Every other intron gets freshly extracted 8-nt
    junction sequences scored with SP.simple_score_junction and is flagged
    peak=False with type '5prime'.

    Exon and transcript sizes are reported in kb, intron size in bases.

    Returns a new DataFrame indexed by "<chromosome>:<position>".
    """
    tx_dict = SP.build_transcript_dict(gff3, organism=organism)
    ss_dict, flag = SP.list_splice_sites(gff3, organism=organism)
    ss_dict = SP.collapse_ss_dict(ss_dict)

    column_dict = {
        'position': [],
        'transcript': [],
        'alt splicing': [],
        'type': [],
        'strand': [],
        'introns in transcript': [],
        'intron size': [],
        'chromosome': [],
        '5p score': [],
        '3p score': [],
        'intron position': [],
        'exon size (us)': [],
        'exon size (ds)': [],
        'transcript size': [],
        'peak': [],
        'seq5': [],
        'seq3': []
    }
    new_index = []

    # Isoform suffix used to key the transcript dictionary.
    iso_suffix = '.1' if organism == 'pombe' else 'T0'
    # Columns copied verbatim from df when the intron already has a peak.
    copied_columns = ('3p score', '5p score', 'alt splicing', 'type', 'seq5',
                      'seq3')

    for tx in set(df['transcript']):
        tx_rows = df[df['transcript'] == tx]
        first_row = tx_rows.iloc[0]
        strand = first_row['strand']

        splice_sites = ss_dict[tx]
        if strand == '+':
            splice_sites = sorted(splice_sites, key=lambda site: site[0])
        elif strand == '-':
            splice_sites = sorted(splice_sites,
                                  key=lambda site: site[0],
                                  reverse=True)

        tx_peaks = tx_rows['position']
        last_n = len(splice_sites) - 1

        for n, (five_site, three_site) in enumerate(splice_sites):
            # Is this intron already represented by a peak in df?
            for peak in tx_peaks:
                if five_site in range(int(peak) - 5, int(peak) + 5):
                    in_df = True
                    df_pos = peak
                    break
            else:
                in_df = False

            iso = tx + iso_suffix
            chrom = first_row['chromosome']

            column_dict['transcript'].append(tx)
            column_dict['intron size'].append(abs(three_site - five_site))
            column_dict['introns in transcript'].append(len(splice_sites))
            column_dict['strand'].append(strand)
            column_dict['chromosome'].append(chrom)
            column_dict['transcript size'].append(
                (tx_dict[iso][1] - tx_dict[iso][0]) / 1000.)

            # Intron rank within the transcript and flanking exon sizes
            if n == 0:
                column_dict['intron position'].append('First')
                if strand == '+':
                    column_dict['exon size (us)'].append(
                        (five_site - tx_dict[iso][0]) / 1000.)
                    if len(splice_sites) > 1:
                        ds_length = (splice_sites[n + 1][0] -
                                     three_site) / 1000.
                        try:
                            # A negative size means the next intron starts
                            # before this one ends; try the one after it.
                            if ds_length < 0:
                                ds_length = (splice_sites[n + 2][0] -
                                             three_site) / 1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (tx_dict[iso][1] - three_site) / 1000.

                elif strand == '-':
                    column_dict['exon size (us)'].append(
                        (tx_dict[iso][1] - five_site) / 1000.)
                    if len(splice_sites) > 1:
                        ds_length = (three_site -
                                     splice_sites[n + 1][0]) / 1000.
                        try:
                            if ds_length < 0:
                                ds_length = (three_site -
                                             splice_sites[n + 2][0]) / 1000.
                        except IndexError:
                            ds_length = np.NaN
                    else:
                        ds_length = (three_site - tx_dict[iso][0]) / 1000.
                column_dict['exon size (ds)'].append(ds_length)

            elif n == last_n:
                column_dict['intron position'].append('Last')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                if strand == '+':
                    column_dict['exon size (ds)'].append(
                        (tx_dict[iso][1] - three_site) / 1000.)
                elif strand == '-':
                    column_dict['exon size (ds)'].append(
                        (three_site - tx_dict[iso][0]) / 1000.)

            else:
                column_dict['intron position'].append('Middle')
                column_dict['exon size (us)'].append(
                    (abs(five_site - splice_sites[n - 1][1]) - 1) / 1000.)
                column_dict['exon size (ds)'].append(
                    abs(three_site - splice_sites[n + 1][0]) / 1000.)

            if in_df:
                # Re-use what was already computed for the matching peak.
                peak_index = chrom + ':' + str(int(df_pos))
                new_index.append(peak_index)
                column_dict['position'].append(df_pos)
                column_dict['peak'].append(True)
                for name in copied_columns:
                    column_dict[name].append(df.loc[peak_index, name])
            else:
                column_dict['alt splicing'].append(False)
                column_dict['type'].append('5prime')
                column_dict['peak'].append(False)

                # Position, index label and junction windows for scoring
                if strand == '+':
                    column_dict['position'].append(five_site + 1)
                    new_index.append(chrom + ':' + str(five_site + 1))
                    sequence1 = fa_dict[chrom][(five_site - 1):(five_site + 7)]
                    sequence2 = fa_dict[chrom][(three_site - 5):(three_site +
                                                                 3)]
                elif strand == '-':
                    column_dict['position'].append(five_site - 1)
                    new_index.append(chrom + ':' + str(five_site - 1))
                    sequence1 = SP.reverse_complement(
                        fa_dict[chrom][(five_site - 6):(five_site + 2)])
                    sequence2 = SP.reverse_complement(
                        fa_dict[chrom][(three_site - 2):(three_site + 6)])

                column_dict['seq5'].append(sequence1)
                column_dict['seq3'].append(sequence2)

                score_5, score_3 = SP.simple_score_junction(
                    sequence1, sequence2, PSSM)
                column_dict['3p score'].append(score_3)
                column_dict['5p score'].append(score_5)

    # Assemble the output frame column by column
    new_df = pd.DataFrame(columns=list(column_dict.keys()), index=new_index)
    for column, data in column_dict.items():
        new_df[column] = data

    return new_df
Пример #17
0
def _classify_unannotated_junction(strand, jct_start, jct_end, introns):
    """Relate a junction to the first annotated intron it sits in or touches.

    Returns (jct_type, intron) with jct_type one of 'Nested', '3p tethered'
    or '5p tethered', or None when no intron qualifies.
    """
    for intron in introns:
        if strand == '+':
            if jct_start > intron[0] and jct_end < intron[1]:
                return 'Nested', intron
            elif jct_start >= intron[0] and jct_end == intron[1]:
                return '3p tethered', intron
            elif jct_start == intron[0] and jct_end <= intron[1]:
                return '5p tethered', intron
        elif strand == '-':
            # NOTE(review): here a shared intron[1] is labelled '5p tethered'
            # and a shared intron[0] '3p tethered', the opposite of the '+'
            # branch.  Kept as found -- confirm this is intended.
            if jct_start < intron[0] and jct_end > intron[1]:
                return 'Nested', intron
            elif jct_start <= intron[0] and jct_end == intron[1]:
                return '5p tethered', intron
            elif jct_start == intron[0] and jct_end >= intron[1]:
                return '3p tethered', intron
    return None


def build_junction_dict(junction_bed,
                        gff3_file,
                        transcript_dict,
                        organism=None):
    """Assign the junctions of a BED file to transcripts and annotated introns.

    Only lines starting with 'c' or 'I' are read; chromosome names 'I', 'II'
    and 'III' are translated to 'chr1', 'chr2' and 'chr3'.  The junction
    coordinates are derived from columns 1 and 2 and the first two
    comma-separated values of column 10; column 4 is the read depth and
    column 5 the strand.  Junctions with depth below 5 are ignored.

    A junction is assigned to the first transcript on its chromosome and
    strand for which jct_start > transcript start and jct_end < transcript
    end, considering only transcripts that have at least one annotated
    intron.  It is then labelled 'Annotated', 'Nested', '3p tethered',
    '5p tethered' or 'Other' relative to that transcript's introns.

    Parameters
    ----------
    junction_bed : str
        Path of the tab-separated junction file.
    gff3_file :
        Annotation passed to SP.list_splice_sites.
    transcript_dict : dict
        Maps isoform name to a sequence whose items 0-3 are start, end,
        strand and chromosome.  The last two characters of the isoform name
        are stripped to obtain the key used for the intron lookup.
    organism :
        Forwarded to SP.list_splice_sites.

    Returns
    -------
    dict
        Maps (transcript, ann_size) to a list of
        [chromosome, jct_start, jct_end, strand, depth, jct_type, size,
        ann_size, ann_start, ann_stop] records.  For 'Other' junctions
        ann_size is 0 and ann_start / ann_stop are None.

    The number of junctions that could not be assigned to a transcript is
    printed before returning.
    """
    junction_dict = {}
    transcript_by_chr = {}
    unassigned_count = 0
    lat_rom = {'I': 'chr1', 'II': 'chr2', 'III': 'chr3'}

    ss_dict, flag = SP.list_splice_sites(gff3_file, organism=organism)
    ss_by_gene = SP.collapse_ss_dict(ss_dict)
    ss_by_gene = {k: v for k, v in ss_by_gene.items() if len(v) > 0}

    # Group the intron-containing transcripts by chromosome
    for transcript, coords in transcript_dict.items():
        if transcript[:-2] in ss_by_gene:
            transcript_by_chr.setdefault(coords[3], []).append(transcript)

    with open(junction_bed, 'r') as fin:
        for line in fin:
            if not line.startswith(('c', 'I')):
                continue

            columns = line.split('\t')
            chromosome = lat_rom.get(columns[0], columns[0])
            strand = columns[5]
            depth = int(columns[4])
            block_sizes = columns[10].split(',')

            if strand == '+':
                jct_start = int(columns[1]) + int(block_sizes[0]) - 1
                jct_end = int(columns[2]) - int(block_sizes[1]) - 1
            elif strand == '-':
                jct_start = int(columns[2]) - int(block_sizes[1])
                jct_end = int(columns[1]) + int(block_sizes[0])
            else:
                # No coordinates can be derived for an unknown strand; never
                # fall back on those of the previous line.
                if depth >= 5:
                    unassigned_count += 1
                continue

            if depth < 5:
                continue

            size = abs(jct_end - jct_start)

            # Per-junction state: nothing may carry over from earlier lines.
            jct_transcript = None
            jct_type = 'Other'
            ann_size = None
            ann_start = None
            ann_stop = None

            # A chromosome without intron-containing transcripts gets an
            # empty candidate list rather than the previous chromosome's.
            for transcript in transcript_by_chr.get(chromosome, []):
                tx = transcript_dict[transcript]
                # NOTE(review): on '-' jct_start > jct_end, so this tests
                # overlap rather than containment -- confirm this is intended.
                if not (jct_start > tx[0] and jct_end < tx[1]
                        and strand == tx[2]):
                    continue

                jct_transcript = transcript
                introns = ss_by_gene[transcript[:-2]]
                try:
                    all_sites = list(zip(*introns))
                    if jct_start in all_sites[0] and jct_end in all_sites[1]:
                        jct_type = 'Annotated'
                        ann_size = size
                        ann_start = jct_start
                        ann_stop = jct_end
                    else:
                        match = _classify_unannotated_junction(
                            strand, jct_start, jct_end, introns)
                        if match is not None:
                            jct_type, intron = match
                            ann_start = intron[0]
                            ann_stop = intron[1]
                            if strand == '+':
                                ann_size = abs(intron[1] - intron[0])
                            else:
                                ann_size = intron[0] - intron[1]
                except IndexError:
                    # Malformed intron list: report it, try the next transcript
                    print(transcript)
                    continue
                # The first transcript containing the junction wins.
                break

            if jct_transcript is None:
                unassigned_count += 1
                continue

            if ann_size is None:
                jct_type = "Other"
                ann_size = 0
                ann_start = None
                ann_stop = None

            junction_dict.setdefault((jct_transcript, ann_size), []).append([
                chromosome, jct_start, jct_end, strand, depth, jct_type, size,
                ann_size, ann_start, ann_stop
            ])

    print(str(unassigned_count) + ' junctions not assigned to transcripts')
    return junction_dict
Пример #18
0
def generate_consensus_matrix(gff3, fasta_dict, PSSM=False):
    """Tabulate base usage around every annotated 5' and 3' splice site.

    Two 4 x 8 matrices are built, one per splice site; rows are A, C, T, G
    and columns are the positions of an 8-nt window around the site
    (reverse-complemented for '-' strand transcripts).  The organism is
    taken to be 'pombe' when that word occurs in the gff3 name.

    With PSSM False the counts are divided by the number of introns.  With
    PSSM True, zero counts are first replaced by 1 and each cell becomes
    log2(frequency / nuc_prob[row]), where nuc_prob comes from
    gc_content(fasta_dict).

    As a side effect numpy's float print format is set to one decimal.

    Returns (pos_matrix_5prime, pos_matrix_3prime).
    """
    # Annotation helpers get organism='pombe' only for pombe annotations
    if 'pombe' in gff3.lower():
        organism = 'pombe'
        annotation_kwargs = {'organism': 'pombe'}
    else:
        organism = None
        annotation_kwargs = {}
    transcript_dict = SP.build_transcript_dict(gff3, **annotation_kwargs)
    ss, flag = SP.list_splice_sites(gff3, **annotation_kwargs)
    ss_dict = SP.collapse_ss_dict(ss)
    nuc_prob = gc_content(fasta_dict)

    # Matrix row of each base: A, C, T, G
    base_dict = {"A": 0, "C": 1, "T": 2, "G": 3}
    pos_matrix_5prime = np.zeros([4, 8])
    pos_matrix_3prime = np.zeros([4, 8])

    isoform_suffix = '.1' if organism == 'pombe' else 'T0'
    intron_total = 0

    for transcript, introns in ss_dict.items():
        tx_info = transcript_dict[transcript + isoform_suffix]
        strand = tx_info[2]
        chrom = tx_info[3]

        for intron in introns:
            intron_total += 1

            # 5' splice site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[0] - 1):(intron[0] + 7)]
            elif strand == '-':
                seq = SP.reverse_complement(
                    fasta_dict[chrom][(intron[0] - 6):(intron[0] + 2)])
            for col, base in enumerate(seq):
                pos_matrix_5prime[base_dict[base], col] += 1

            # 3' splice site window
            if strand == '+':
                seq = fasta_dict[chrom][(intron[1] - 5):(intron[1] + 3)]
            elif strand == '-':
                seq = SP.reverse_complement(
                    fasta_dict[chrom][(intron[1] - 2):(intron[1] + 6)])
            for col, base in enumerate(seq):
                pos_matrix_3prime[base_dict[base], col] += 1

    np.set_printoptions(formatter={'float_kind': lambda x: "%.1f" % x})

    # Turn raw counts into frequencies or log-odds, cell by cell
    total = float(intron_total)
    for row in range(4):
        for col in range(8):
            for matrix in (pos_matrix_5prime, pos_matrix_3prime):
                if PSSM is False:
                    matrix[row, col] = matrix[row, col] / total
                elif PSSM is True:
                    if matrix[row, col] == 0:
                        matrix[row, col] += 1
                    matrix[row, col] = np.log2(
                        (matrix[row, col] / total) / nuc_prob[row])

    return (pos_matrix_5prime, pos_matrix_3prime)