forked from ctokheim/PrimerSeq
/
read_counts.py
executable file
·115 lines (100 loc) · 5.35 KB
/
read_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import splice_graph as sg
from exon_seek import ExonSeek
import algorithms as algs
import utils
import logging
def save_isforms_and_counts(line, options):
    """Save isoform paths and estimated counts for a single target event.

    A splice graph is built for every entry of ``options['rnaseq']``; for
    each graph the isoform paths and their counts are computed and handed
    to ``utils.save_path_info`` under the name ``<ID>.<bam index>`` in
    ``tmp/indiv_isoforms/``.

    Args:
        line: One event row. Its first two fields are the event ID and the
            target coordinate, whose first character is read as the
            strand. The fields at ``utils.PSI_UP`` and ``utils.PSI_DOWN``
            decide how the flanking exons are chosen.
        options: Dict of run settings. This function reads the keys
            ``'rnaseq'``, ``'read_threshold'``, ``'min_jct_count'``,
            ``'both_flag'`` and ``'psi'``; ``retrieve_gene_information``
            reads further keys from the same dict.

    Returns:
        None. Results are written through ``utils.save_path_info``.
    """
    # get information about each row
    event_id, target_coordinate = line[:2]
    strand = target_coordinate[0]
    # the strand character is dropped before extracting the chromosome
    chrom = utils.get_chr(target_coordinate[1:])
    tgt_start, tgt_end = utils.get_pos(target_coordinate)
    logging.debug('Saving isoform and count information for event %s . . .',
                  event_id)

    # get information from GTF annotation (the gene name is not needed here)
    gene_dict, _gene_name = retrieve_gene_information(options,
                                                      strand, chrom,
                                                      tgt_start, tgt_end)

    # get edge weights, one entry per RNA-Seq input
    edge_weights_list = [
        sam_obj.extractSamRegion(chrom, gene_dict['start'], gene_dict['end'])
        for sam_obj in options['rnaseq']
    ]

    # construct splice graph for each BAM file
    bam_splice_graphs = sg.construct_splice_graph(edge_weights_list,
                                                  gene_dict,
                                                  chrom,
                                                  strand,
                                                  options['read_threshold'],
                                                  options['min_jct_count'],
                                                  output_type='list',
                                                  both=options['both_flag'])

    for bam_ix, my_splice_graph in enumerate(bam_splice_graphs):
        if line[utils.PSI_UP] == '-1' and line[utils.PSI_DOWN] == '-1':
            # '-1' in both PSI columns: user-defined flanking exons.
            # NOTE: single-exon paths are not filtered out of the result.
            paths, counts = user_defined_exons(my_splice_graph, line)
        else:
            # automatic choice of flanking exons
            paths, counts = primerseq_defined_exons(my_splice_graph, line,
                                                    options['psi'])
        utils.save_path_info('%s.%d' % (event_id, bam_ix),
                             paths, counts,
                             save_dir='tmp/indiv_isoforms/')
    logging.debug('Finished saving isoform and count information for event %s.',
                  event_id)
def user_defined_exons(tmp_sg, line):
    """
    Get paths and counts when the flanking exons were defined by the user.

    The exons considered are the sorted splice graph nodes lying between
    the exon found for the first primer and the exon found for the second
    primer. Transcript paths over those exons are trimmed using the primer
    coordinates and the user-defined flanking exons, then counts are
    estimated.

    Returns a ``(paths, counts)`` tuple as produced by
    ``AllPaths.estimate_counts``.
    """
    target = line[utils.TARGET]
    # NOTE(review): save_isforms_and_counts strips the leading strand
    # character before calling utils.get_chr, this call does not - confirm
    # that get_chr tolerates both forms.
    chrom = utils.get_chr(target)
    strand = target[0]

    # user-defined flanking exons and primer positions
    upstream_exon = utils.get_pos(line[utils.UPSTREAM_EXON])
    downstream_exon = utils.get_pos(line[utils.DOWNSTREAM_EXON])
    first_primer, second_primer = utils.get_primer_coordinates(line[utils.PRIMER_COORD])

    # exons that can be part of the amplified product, ordered by (start, end)
    exon_nodes = list(tmp_sg.get_graph().nodes())
    exon_nodes.sort(key=lambda node: (node[0], node[1]))
    first_ix = utils.find_first_exon(first_primer, exon_nodes)
    last_ix = utils.find_last_exon(second_primer, exon_nodes)
    candidate_exons = exon_nodes[first_ix:last_ix + 1]
    # (A disabled earlier variant sliced between the user-defined exons
    # themselves instead of using the primer positions.)

    # Use correct tx's and estimate counts/psi
    target_pos = utils.get_pos(line[utils.TARGET])  # tuple (start, end)
    all_paths = algs.AllPaths(tmp_sg, candidate_exons, target_pos,
                              chr=chrom, strand=strand)

    # the upstream exon comes first on '+', the downstream exon otherwise
    if strand == "+":
        leading_exon, trailing_exon = upstream_exon, downstream_exon
    else:
        leading_exon, trailing_exon = downstream_exon, upstream_exon
    all_paths.trim_tx_paths_using_primers(first_primer, second_primer,
                                          leading_exon, trailing_exon)
    all_paths.set_all_path_coordinates()

    tx_paths, tx_counts = all_paths.estimate_counts()  # run EM algorithm
    return tx_paths, tx_counts
def primerseq_defined_exons(tmp_sg, line, psi_option):
    """
    Get paths and counts when PrimerSeq chooses the flanking exons.

    ExonSeek was initially intended for finding appropriate flanking
    exons; here it is only used to obtain the transcript paths and their
    associated counts, so no upstream/downstream exon is passed in.

    Returns the ``paths`` and ``counts`` attributes of the ExonSeek object
    as a tuple.
    """
    event_id = line[utils.ID]
    target_span = utils.get_pos(line[utils.TARGET])
    seeker = ExonSeek(target_span, tmp_sg, event_id, psi_option,
                      None,   # no defined upstream exon
                      None)   # no defined downstream exon

    # The seven values returned by get_info() are not used here; paths and
    # counts are read from the object once the call has completed.
    (_all_paths, _upstream, _downstream, _component,
     _psi_target, _psi_upstream, _psi_downstream) = seeker.get_info()
    return seeker.paths, seeker.counts
def retrieve_gene_information(options, strand, chr, start, end):
    """
    Fetch gene information from the GTF annotation for a region.

    When ``options['no_gene_id']`` is truthy the lookup goes through
    weakly connected transcripts (intended to filter out junk); otherwise
    everything belonging to a single gene is retrieved by gene name.

    Returns a ``(gene_dict, gene_name)`` tuple from the selected
    ``splice_graph`` helper.
    """
    # NOTE: the parameter name `chr` shadows the builtin; it is kept
    # because it is part of this function's signature.
    if options['no_gene_id']:
        lookup = sg.get_weakly_connected_tx
    else:
        lookup = sg.get_from_gtf_using_gene_name
    gene_dict, gene_name = lookup(options['gtf'], strand, chr, start, end)
    return gene_dict, gene_name