Exemplo n.º 1
0
 def _set_alignment_ranges(self):
     """Populate ``self._alignment_ranges`` from the CIGAR operations.

     When ``self.is_aligned()`` is false the attribute is set to ``None``.
     Otherwise it becomes a list of ``[target_range, query_range]`` pairs,
     one pair per ``M``, ``=`` or ``X`` operation.  Target coordinates start
     at ``self.value('pos')`` and query coordinates start at 1; both ranges
     are built with inclusive end positions.
     """
     if not self.is_aligned():
         self._alignment_ranges = None
         return
     self._alignment_ranges = []
     rname = self.value('rname')
     qname = self.value('qname')
     target_pos = self.value('pos')
     query_pos = 1
     # Walk the operations in order rather than copying the list and popping
     # from its front, which cost O(n) per operation.
     for entry in self.get_cigar():
         length, op = entry[0], entry[1]
         if op == 'S' or op == 'I':
             # Soft clip or insertion: advances the query only.
             query_pos += length
         elif op == 'N' or op == 'D':
             # Skipped or deleted reference bases: advances the target only.
             target_pos += length
         elif op in ('M', '=', 'X'):
             # Aligned block: advances both and is recorded as a range pair.
             t_start = target_pos
             q_start = query_pos
             target_pos += length
             query_pos += length
             self._alignment_ranges.append([
                 GenomicRange(rname, t_start, target_pos - 1),
                 GenomicRange(qname, q_start, query_pos - 1)
             ])
         # Any other operation (for example 'H') moves neither coordinate,
         # exactly as before.
     return
 def _initialize(self):
     """Parse the stored line on first use and build exons and junctions.

     Parsing is deferred until needed so that streaming over many lines
     stays cheap.  Calling this again after the first time is a no-op.
     """
     if self._initialized:
         return  # already parsed
     self._initialized = True
     self._entry = _line_to_entry(self._line)
     self._exons = []
     self._junctions = []
     self._payload = []
     self._direction = self.value('strand')
     self._gene_name = self.value('gene_name')
     self._transcript_name = self.value('name')
     self._name = None

     chrom = self.value('chrom')
     starts = self.value('exonStarts')
     ends = self.value('exonEnds')
     exon_count = self.value('exonCount')

     # Each exon start is shifted by +1 (presumably converting a 0-based
     # start into a 1-based coordinate -- confirm against the parser).
     for idx in range(exon_count):
         exon_range = GenomicRange(chrom, starts[idx] + 1, ends[idx])
         self._exons.append(Bio.Structure.Exon(exon_range))

     # One junction between every pair of neighbouring exons: a single-base
     # range at the end of the left exon and one at the start of the right.
     for idx in range(exon_count - 1):
         left_edge = GenomicRange(chrom, ends[idx], ends[idx])
         right_edge = GenomicRange(chrom, starts[idx + 1] + 1,
                                   starts[idx + 1] + 1)
         junction = Bio.Structure.Junction(left_edge, right_edge)
         junction.set_exon_left(self._exons[idx])
         junction.set_exon_right(self._exons[idx + 1])
         self._junctions.append(junction)
     self._sequence = None
Exemplo n.º 3
0
 def __init__(self, gpd_line):
     """Parse ``gpd_line`` immediately and build exons, junctions and range.

     Args:
         gpd_line: one text line, parsed by ``self._line_to_entry``.
     """
     self._entry = self._line_to_entry(gpd_line)
     self._line = gpd_line.rstrip()
     self._range = None
     self.exons = []
     self.junctions = []
     self._payload = []
     self._direction = self.value('strand')
     self._gene_name = self.value('gene_name')
     self._transcript_name = self.value('name')
     self._name = None

     chrom = self.value('chrom')
     starts = self.value('exonStarts')
     ends = self.value('exonEnds')
     exon_count = self.value('exonCount')

     # Each exon start is shifted by +1 (presumably converting a 0-based
     # start into a 1-based coordinate -- confirm against the parser).
     for idx in range(exon_count):
         exon_range = GenomicRange(chrom, starts[idx] + 1, ends[idx])
         self.exons.append(Bio.Structure.Exon(exon_range))

     # One junction between every pair of neighbouring exons.
     for idx in range(exon_count - 1):
         left_edge = GenomicRange(chrom, ends[idx], ends[idx])
         right_edge = GenomicRange(chrom, starts[idx + 1] + 1,
                                   starts[idx + 1] + 1)
         junction = Bio.Structure.Junction(left_edge, right_edge)
         junction.set_exon_left(self.exons[idx])
         junction.set_exon_right(self.exons[idx + 1])
         self.junctions.append(junction)

     # Overall span: from the first exon start to the last exon end.
     self._range = GenomicRange(chrom, starts[0] + 1, ends[-1])
     self._id = str(uuid.uuid4())
     self._sequence = None
Exemplo n.º 4
0
 def _set_alignment_ranges(self):
     """Set ``self._target_range`` and the per-block alignment ranges.

     ``self._alignment_ranges`` becomes a list of ``[target, query]`` range
     pairs, one per entry of ``blockSizes``.
     """
     target_name = self.value('tName')
     query_name = self.value('qName')
     # NOTE(review): tStart is used as-is here while every block start
     # below is shifted by +1 -- confirm this is not an off-by-one.
     self._target_range = GenomicRange(target_name,
                                       self.value('tStart'),
                                       self.value('tEnd'))
     self._alignment_ranges = []
     block_sizes = self.value('blockSizes')
     target_starts = self.value('tStarts')
     query_starts = self.value('qStarts')
     for idx in range(len(block_sizes)):
         size = block_sizes[idx]
         t_start = target_starts[idx]
         q_start = query_starts[idx]
         target_block = GenomicRange(target_name, t_start + 1, t_start + size)
         query_block = GenomicRange(query_name, q_start + 1, q_start + size)
         self._alignment_ranges.append([target_block, query_block])
     return
 def get_actual_query_range(self):
     """Return the aligned query span as a stranded ``GenomicRange``.

     On the '+' strand the span runs from the first aligned block's query
     start to the last block's query end.  Otherwise the coordinates are
     mirrored against ``self.get_query_length()``.
     """
     ranges = self.get_alignment_ranges()
     first_query = ranges[0][1]
     last_query = ranges[-1][1]
     strand = self.get_strand()
     if strand != '+':
         # must be - strand: flip the coordinates around the query length
         query_length = self.get_query_length()
         return GenomicRange(first_query.chr,
                             query_length - last_query.end + 1,
                             query_length - first_query.start + 1,
                             strand)
     return GenomicRange(first_query.chr, first_query.start,
                         last_query.end, strand)
 def get_transcript(self, exon_bounds='max'):
     """Assemble a ``Transcript`` from this object's junction groups.

     With no junction groups a single exon is returned, spanning the
     smallest first-exon start to the largest last-exon end over
     ``self.transcripts``.  Otherwise internal exons are placed between
     consecutive junctions, and the outer exons extend to the most extreme
     coordinate among the exons supporting the first and last junctions.

     Args:
         exon_bounds: accepted for interface compatibility; this
             implementation does not read it.

     Returns:
         The assembled ``Transcript``.

     Note:
         Writes an error to stderr and calls ``sys.exit()`` when no left or
         right exon can be found.  As a side effect ``self.exons`` is reset
         to an empty list on the multi-exon path.
     """
     out = Transcript()
     out.junctions = [x.get_junction() for x in self.junction_groups]
     # check for single exon transcript
     if len(out.junctions) == 0:
         leftcoord = min([t.exons[0].rng.start for t in self.transcripts])
         rightcoord = max([t.exons[-1].rng.end for t in self.transcripts])
         # The chromosome used to be read from a list-comprehension variable
         # after the comprehension had finished.  That only works on
         # Python 2, where it resolves to the last transcript, so the last
         # transcript is named explicitly here.
         chrom = self.transcripts[-1].exons[0].rng.chr
         e = Exon(GenomicRange(chrom, leftcoord, rightcoord))
         e.set_is_leftmost()
         e.set_is_rightmost()
         out.exons.append(e)
         return out
     # get internal exons
     self.exons = []  # kept from the original; not used further in here
     for j1, j2 in zip(out.junctions[:-1], out.junctions[1:]):
         e = Exon(GenomicRange(j1.right.chr, j1.right.end, j2.left.start))
         e.set_left_junc(j1)
         e.set_right_junc(j2)
         out.exons.append(e)
     # get left exon
     left_exons = [
         y for y in [
             self.transcripts[ev[0]].junctions[ev[1]].get_left_exon()
             for ev in self.junction_groups[0].evidence
         ] if y
     ]
     if len(left_exons) == 0:
         sys.stderr.write("ERROR no left exon\n")
         # NOTE(review): sys.exit() without an argument exits with status 0.
         sys.exit()
     first_junction = out.junctions[0]
     e_left = Exon(GenomicRange(first_junction.left.chr,
                                min([x.get_range().start for x in left_exons]),
                                first_junction.left.start))
     e_left.set_right_junc(first_junction)
     out.exons.insert(0, e_left)
     # get right exon
     right_exons = [
         y for y in [
             self.transcripts[ev[0]].junctions[ev[1]].get_right_exon()
             for ev in self.junction_groups[-1].evidence
         ] if y
     ]
     if len(right_exons) == 0:
         sys.stderr.write("ERROR no right exon\n")
         sys.exit()
     last_junction = out.junctions[-1]
     e_right = Exon(GenomicRange(last_junction.right.chr,
                                 last_junction.right.end,
                                 max([x.get_range().end for x in right_exons])))
     e_right.set_left_junc(last_junction)
     out.exons.append(e_right)
     return out
Exemplo n.º 7
0
 def __init__(self, index_file):
     """Load a gzip-compressed, tab-separated index file into memory.

     Each line is split on tabs and read as: query name, range string
     (empty when there is none), two integers stored as ``filestart`` and
     ``innerstart``, then ``basecount`` and ``flag``.

     Args:
         index_file: path handed to ``gzip.open``.

     Populates:
         _name_to_num / _num_to_name: query name <-> sequential number.
         _queries: query number -> list of line counters.
         _lines: one dict per line, in file order.
         _coords: filestart -> innerstart -> one-indexed line number.
         _chrs: chromosome -> one-indexed line numbers that have a range.
         _unaligned: one-indexed line numbers that have no range.
     """
     self.index_file = index_file
     self._name_to_num = {}
     self._num_to_name = {}
     self._queries = {}
     self._chrs = {}
     self._unaligned = []
     self._lines = []
     self._coords = {}  # get the one indexed line number from coordinates
     z = 0
     linenum = 0
     inf = gzip.open(self.index_file)
     try:
         for line in inf:
             f = line.rstrip("\n").split("\t")
             name = f[0]
             # Look the name up in the name-keyed map.  The previous test
             # against the number-keyed map was always true, so a repeated
             # name was given a fresh number every time.
             if name not in self._name_to_num:
                 self._num_to_name[z] = name
                 self._name_to_num[name] = z
                 num = z
                 z += 1
             else:
                 num = self._name_to_num[name]
             filestart = int(f[2])
             innerstart = int(f[3])
             rng = None
             if f[1] != '':
                 rng = GenomicRange(range_string=f[1])
                 rng.set_payload([num, filestart, innerstart])
             # NOTE(review): this stores the counter before it is
             # incremented (zero-based), whereas _coords, _chrs and
             # _unaligned store it after (one-based) -- confirm intended.
             self._queries.setdefault(num, []).append(linenum)
             self._lines.append({
                 'qname': name,
                 'rng': rng,
                 'filestart': filestart,
                 'innerstart': innerstart,
                 'basecount': int(f[4]),
                 'flag': int(f[5])
             })
             linenum += 1
             self._coords.setdefault(filestart, {})[innerstart] = linenum
             if rng:
                 self._chrs.setdefault(rng.chr, []).append(linenum)
             else:
                 self._unaligned.append(linenum)
     finally:
         # Release the handle even when a malformed line raises.
         inf.close()
     return
 def get_range(self):
     """Return the stored range, or one spanning the first to last exon.

     The fallback range is built on the fly and is not cached.
     """
     self._initialize()
     if not self._range:
         first = self.exons[0].get_range()
         last = self.exons[-1].get_range()
         return GenomicRange(first.chr, first.start, last.end)
     return self._range
 def set_range(self):
     """Recompute ``self._range`` from the first and last exon.

     Leaves ``self._range`` untouched when there are no exons or when the
     exons lie on more than one chromosome.  Always returns ``None``.
     """
     self._initialize()
     if not self.exons:
         return None  # nothing to span
     chroms = set(exon.rng.chr for exon in self.exons)
     if len(chroms) > 1:
         return None  # chimeric: no single range describes it
     chrom = next(iter(chroms))
     first_start = self.exons[0].rng.start
     last_end = self.exons[-1].rng.end
     self._range = GenomicRange(chrom, first_start, last_end)
 def get_junction(self1):
     """Return the consensus junction for this group, computing it once.

     The left coordinate is ``_mode`` over the supporting junctions' left
     ends and the right coordinate is ``_mode`` over their right starts
     (``_mode`` presumably picks the most frequent value).  The result is
     stored in ``representative_junction`` and reused on later calls.
     """
     cached = self1.representative_junction
     if cached:
         return cached
     # Each evidence entry holds (transcript index, junction index).
     supporting = [
         self1.outer.transcripts[ev[0]].junctions[ev[1]]
         for ev in self1.evidence
     ]
     left_sides = [junction.left for junction in supporting]
     right_sides = [junction.right for junction in supporting]
     left = _mode([side.end for side in left_sides])
     right = _mode([side.start for side in right_sides])
     consensus = Junction(GenomicRange(left_sides[0].chr, left, left),
                          GenomicRange(right_sides[0].chr, right, right))
     self1.representative_junction = consensus
     return consensus
 def __init__(self, gpd_line):
     """Store the raw line and extract only its range; defer full parsing.

     The range is taken from the third, fifth and sixth tab-separated
     fields of ``gpd_line`` (name, start and end), with +1 added to the
     start.

     Args:
         gpd_line: one tab-separated text line.
     """
     self._initialized = False
     # Only store the line and ID at first.
     self._line = gpd_line.rstrip()
     self._id = str(uuid.uuid4())
     fields = re.match(
         '[^\t]+\t[^\t]+\t([^\t]+)\t[^\t]+\t([^\t]+)\t([^\t]+)', gpd_line)
     chrom = fields.group(1)
     start_text = fields.group(2)
     end_text = fields.group(3)
     self._range = GenomicRange(chrom, int(start_text) + 1, int(end_text))
Exemplo n.º 12
0
 def get_target_range(self):
     """Return the target span of this alignment, caching the result.

     Returns ``None`` when the entry is not aligned.  The span starts at
     ``self.value('pos')`` and its length is the sum of the CIGAR lengths
     whose operation matches ``_sam_cigar_target_add``.
     """
     if not self.is_aligned():
         return None
     if self._target_range:
         return self._target_range
     target_length = 0
     for entry in self.get_cigar():
         if _sam_cigar_target_add.match(entry[1]):
             target_length += entry[0]
     start = self.value('pos')
     end = start + target_length - 1
     self._target_range = GenomicRange(self.value('rname'), start, end)
     return self._target_range
 def set_exons_and_junctions_from_ranges(self, rngs):
     """Replace this object's exons and junctions using ``rngs``.

     One exon is created per range, in the given order.  The first and last
     exons are flagged as leftmost and rightmost, and a junction is created
     between every pair of neighbouring exons.  ``self.set_range()`` is
     then called once to refresh the overall range.

     Args:
         rngs: non-empty sequence of objects with ``chr``, ``start`` and
             ``end`` attributes.  An empty sequence raises ``IndexError``.
     """
     self._initialize()
     self.exons = []
     self.junctions = []
     for e in rngs:
         self.exons.append(Exon(GenomicRange(e.chr, e.start, e.end)))
     self.exons[0].set_is_leftmost()
     self.exons[-1].set_is_rightmost()
     for left_exon, right_exon in zip(self.exons[:-1], self.exons[1:]):
         # Single-base ranges: the last base of the left exon and the first
         # base of the right exon.  The right range previously started at
         # the LEFT exon's start, so it covered the whole left exon and
         # the intron.
         jx = Junction(GenomicRange(left_exon.rng.chr,
                                    left_exon.rng.end,
                                    left_exon.rng.end),
                       GenomicRange(right_exon.rng.chr,
                                    right_exon.rng.start,
                                    right_exon.rng.start))
         jx.set_exon_left(left_exon)
         jx.set_exon_right(right_exon)
         self.junctions.append(jx)
     # Refresh the range once, after the loop, so that single-exon inputs
     # are updated as well.
     self.set_range()
     return
Exemplo n.º 14
0
 def get_actual_original_query_range(self):
     """Return the aligned span in original-query coordinates.

     On the '-' strand the first-to-last aligned query span is mirrored
     against ``self.get_original_query_length()``; on any other strand it
     is returned unchanged.
     """
     original_length = self.get_original_query_length()
     ranges = self.get_alignment_ranges()
     first_query = ranges[0][1]
     last_query = ranges[-1][1]
     strand = self.get_strand()
     if strand == '-':
         start = 1 + original_length - last_query.end
         end = original_length - (first_query.start - 1)
     else:
         start = first_query.start
         end = last_query.end
     return GenomicRange(first_query.chr, start, end, strand)
Exemplo n.º 15
0
def _open_maybe_gzip(path):
    """Open ``path`` for reading, via gzip when ``is_gzip(path)`` is true."""
    if is_gzip(path):
        return gzip.open(path)
    return open(path)


def _parse_annotation_line(line):
    """Turn one tab-separated annotation line into a dict of typed fields."""
    f = line.rstrip().split("\t")
    return {
        'read_line': int(f[0]),
        'read_name': f[1],
        'gene_name': f[2],
        'tx_name': f[3],
        'type': f[4],
        'matching_exon_count': int(f[5]),
        'consecutive_exons': int(f[6]),
        'read_exons': int(f[7]),
        'tx_exons': int(f[8]),
        'overlap': int(f[9]),
        'read_length': int(f[10]),
        'tx_length': int(f[11]),
        'read_range': GenomicRange(range_string=f[12]),
        'tx_range': GenomicRange(range_string=f[13]),
        'ref_line': int(f[14]),
    }


def main(args):
    """Write one row of 100 per-bin values for each reference transcript.

    Reads the reference genePred, the read annotations and the read
    genePred, groups the selected reads by the reference line they were
    annotated to, and passes each group to ``do_tx_line``.  Each output row
    is the transcript name followed by the values for bins 1..100, with 0
    for bins that ``do_tx_line`` did not report.

    Args:
        args: namespace providing ``ref_genepred``, ``annotations``,
            ``read_genepred``, ``full``, ``minimum_matched_exons``,
            ``output`` and ``output_counts``.
    """
    sys.stderr.write("Reading in reference genePred\n")
    refgpd = {}  # one-based line number -> reference entry
    inf = open(args.ref_genepred)
    try:
        for z, gpd in enumerate(GPDStream(inf), 1):
            refgpd[z] = gpd
    finally:
        inf.close()

    sys.stderr.write("Reading in read annotations\n")
    reflocs = {}  # reference line -> every annotation on it (unfiltered)
    rline = {}  # read line -> annotation, only for reads passing the filters
    inf = _open_maybe_gzip(args.annotations)
    try:
        for line in inf:
            res = _parse_annotation_line(line)
            reflocs.setdefault(res['ref_line'], []).append(res)
            if args.full and res['type'] != 'full':
                continue
            if args.minimum_matched_exons > res['matching_exon_count']:
                continue
            rline[res['read_line']] = res
    finally:
        inf.close()

    sys.stderr.write("reading read genepred\n")
    originals = {}  # reference line -> {read line -> read entry}
    inf = _open_maybe_gzip(args.read_genepred)
    try:
        for z, gpd in enumerate(GPDStream(inf), 1):
            if z not in rline:
                continue
            originals.setdefault(rline[z]['ref_line'], {})[z] = gpd
    finally:
        inf.close()

    read_total = 0
    outs = {}
    for tx_line in originals:
        ref_gpd = refgpd[tx_line]
        annots = reflocs[tx_line]
        reads = originals[tx_line].values()
        v = do_tx_line(ref_gpd, annots, reads, args)
        if not v:
            continue
        # v[0] maps bin labels "1".."100" to values; v[1] is a read count.
        tname = ref_gpd.get_transcript_name()
        read_total += v[1]
        outs[tname] = [v[0].get(str(i), 0) for i in range(1, 101)]

    of = sys.stdout
    if args.output and re.search(r'\.gz', args.output):
        of = gzip.open(args.output, 'w')
    elif args.output:
        of = open(args.output, 'w')
    tot = len(outs)
    try:
        for tname in outs:
            of.write(tname + "\t" + "\t".join([str(x)
                                               for x in outs[tname]]) + "\n")
    finally:
        # Only close handles opened here; sys.stdout belongs to the process.
        if of is not sys.stdout:
            of.close()
    if args.output_counts:
        with open(args.output_counts, 'w') as counts_file:
            counts_file.write(str(tot) + "\t" + str(read_total) + "\n")
    sys.stderr.write(
        str(tot) + " total transcripts \t" + str(read_total) +
        " total reads\n")
 def get_target_range(self):
     """Return the target span from the first to the last aligned block."""
     ranges = self.get_alignment_ranges()
     first_target = ranges[0][0]
     last_target = ranges[-1][0]
     return GenomicRange(first_target.chr, first_target.start,
                         last_target.end)
 def get_range(self):
     """Return the range covering every transcript, or ``None``.

     ``None`` is returned unless the transcripts' ranges share exactly one
     chromosome.  Otherwise the result runs from the smallest start to the
     largest end among them.
     """
     ranges = [tx.get_range() for tx in self.get_transcripts()]
     chroms = set(rng.chr for rng in ranges)
     if len(chroms) != 1:
         return None
     chrom = next(iter(chroms))
     lowest_start = min(rng.start for rng in ranges)
     highest_end = max(rng.end for rng in ranges)
     return GenomicRange(chrom, lowest_start, highest_end)