示例#1
0
 def setUp(self):
     # Fixture: ten-wide intervals starting at 0, 10, ..., 100. The root node
     # is seeded with the interval starting at 50, so that start is left out
     # of the insertion loop to avoid adding it twice.
     root_start = 50
     tree = IntervalNode(Interval(root_start, root_start + 9))
     remaining_starts = [s for s in range(0, 110, 10) if s != root_start]
     for start in remaining_starts:
         # insert() returns the (possibly new) root, so keep rebinding it.
         tree = tree.insert(Interval(start, start + 9))
     self.intervals = tree
示例#2
0
    def test_max_dist(self):
        tree = self.intervals

        # A zero max_dist is expected to yield no neighbours to the right.
        nothing = tree.right(Interval(1, 1), max_dist=0, n=10)
        self.assertEqual(len(nothing), 0)

        # Every additional 10 units of allowed distance is expected to bring
        # exactly one more interval into range (n is large enough not to cap).
        for expected_count, dist in enumerate(range(10, 1000, 10), start=1):
            hits = tree.right(Interval(1, 1), max_dist=dist, n=10000)
            self.assertEqual(len(hits), expected_count)
示例#3
0
    def test_count(self):
        tree = self.intervals

        # Requesting 33 neighbours to the right is expected to return all 33.
        right_hits = tree.right(Interval(1, 1), n=33)
        self.assertEqual(len(right_hits), 33)

        # To the left of the same point only a single interval is expected,
        # regardless of how many were requested.
        left_hits = tree.left(Interval(1, 1), n=33)
        self.assertEqual(len(left_hits), 1)
示例#4
0
    def setUp(self):
        # Large fixture: a [1, 2] root, one zero-length interval at every
        # multiple of 10 below self.max, then 6000 identical [0, 1] intervals.
        tree = IntervalNode(Interval(1, 2))
        self.max = 1000000
        for pos in range(0, self.max, 10):
            tree = tree.insert(Interval(pos, pos))

        duplicate_count = 6000
        for _ in range(duplicate_count):
            tree = tree.insert(Interval(0, 1))
        self.intervals = tree
示例#5
0
    def test_left(self):
        tree = self.intervals

        # The two nearest intervals left of [60, 70] are compared through
        # their string form, nearest first.
        found = tree.left(Interval(60, 70), n=2)
        expected = [Interval(50, 59), Interval(40, 49)]
        self.assertEqual(str(found), str(expected))

        # For a point at each multiple of 10, the closest interval on the left
        # is expected to end immediately before that point.
        for pos in range(10, 100, 10):
            probe = Interval(pos, pos)
            nearest = tree.left(probe, max_dist=10, n=1)
            self.assertEqual(nearest[0].end, pos - 1)
示例#6
0
 def test_n(self):
     tree = self.intervals
     for base in range(0, 90, 10):
         probe = Interval(base + 1, base + 1)
         neighbours = tree.right(probe, max_dist=20, n=2)
         # The two results are expected to start at the next two multiples
         # of 10 after the probe, in order of increasing distance.
         for rank, expected_start in enumerate((base + 10, base + 20)):
             self.assertEqual(neighbours[rank].start, expected_start)
示例#7
0
    def test_right(self):
        tree = self.intervals

        # NOTE(review): this first check exercises left(), not right(); it
        # looks carried over from test_left -- confirm it is intentional.
        found = tree.left(Interval(60, 70), n=2)
        expected = [Interval(50, 59), Interval(40, 49)]
        self.assertEqual(str(found), str(expected))

        def get_right_start(b10):
            # Start of the single nearest interval right of [b10, b10 + 1].
            hits = tree.right(Interval(b10, b10 + 1), n=1)
            assert len(hits) == 1
            return hits[0].start

        for pos in range(10, 100, 10):
            self.assertEqual(get_right_start(pos), pos + 10)

        # A point one unit before each multiple of 10 is expected to have the
        # interval starting at that multiple as its nearest right neighbour.
        for pos in range(0, 100, 10):
            probe = Interval(pos - 1, pos - 1)
            hits = tree.right(probe, max_dist=10, n=1)
            self.assertEqual(hits[0].start, pos)
示例#8
0
    def test_tree_pickle(self):
        # Round-trip an IntervalTree through dump()/load() and check that the
        # reloaded tree answers find() with the same first/last hits and the
        # same number of hits as the tree it was dumped from.
        import os
        import shutil
        import tempfile

        a = IntervalTree()
        for _ in range(5):
            for i in range(10, 100, 6):
                a.insert(Interval(i - 4, i + 4))

        # Write the pickle into a private temporary directory instead of the
        # current working directory, and always remove it afterwards so the
        # test leaves no 'a.pkl' behind (and cannot clash with parallel runs).
        workdir = tempfile.mkdtemp()
        try:
            pkl_path = os.path.join(workdir, 'a.pkl')
            a.dump(pkl_path)

            b = IntervalTree()
            b.load(pkl_path)

            by_start = operator.attrgetter('start')
            for _ in range(5):
                for i in range(10, 100, 6):
                    f = Interval(i - 4, i + 4)
                    af = sorted(a.find(f), key=by_start)
                    bf = sorted(b.find(f), key=by_start)

                    # unittest assertion rather than a bare assert, so the
                    # check still runs under `python -O`.
                    self.assertTrue(len(bf) > 0)
                    self.assertEqual(len(af), len(bf))
                    self.assertEqual(af[0].start, bf[0].start)
                    self.assertEqual(af[-1].start, bf[-1].start)
        finally:
            shutil.rmtree(workdir, ignore_errors=True)
示例#9
0
 def annotation(self,
                annotation_type,
                start=None,
                end=None) -> typing.List[Annotation]:
     """Return the sorted annotation payloads, optionally filtered.

     Parameters
     ----------
     annotation_type:
         Annotation type to keep, compared case-insensitively. A falsy value
         keeps annotations of every type.
     start, end:
         When both are given, ``self._annotations`` is queried with
         ``Interval(start, end)`` and only entries whose ``data`` overlaps
         ``Span(start, end)`` are kept. If either is ``None`` the query is
         ``Interval(0, self.end)`` with no overlap filter.

     Returns
     -------
     Sorted list of the ``data`` objects of the matching entries, excluding
     any that compare equal to ``self``. An empty list is returned when the
     interval lookup raises.
     """
     try:
         if end is None or start is None:
             anno_iter = self._annotations.find(Interval(0, self.end))
         else:
             anno_iter = filter(
                 lambda x: x.data.overlaps(Span(start, end)),
                 self._annotations.find(Interval(start, end)))
     except Exception:
         # Best-effort lookup: a failing query yields "no annotations".
         # Catch Exception rather than everything, so KeyboardInterrupt and
         # SystemExit are no longer silently swallowed here.
         return []
     if annotation_type:
         annotation_type = annotation_type.lower()
         return sorted([
             x.data for x in anno_iter
             if x.data.annotation_type.lower() == annotation_type
             and x.data != self
         ])
     return sorted([x.data for x in anno_iter if x.data != self])
示例#10
0
def parse_ribotricer_index(ribotricer_index):
    """
    Collect the leading 'annotated' ORFs from a ribotricer index file.

    The header line is skipped, then lines are consumed for as long as they
    contain the text "annotated"; reading stops at the first line that does
    not (the annotated entries are expected to come first in the file).

    Parameters
    ----------
    ribotricer_index: str
                   Path to the index file generated by ribotricer prepare_orfs

    Returns
    -------
    annotated: List[ORF]
               ORFs parsed from those lines whose category is "annotated"
    refseq: defaultdict(IntervalTree)
            keyed by ORF chrom; each tree holds one Interval per annotated
            ORF built from (first interval start, last interval end,
            STRAND_TO_NUM[strand])
    """
    annotated = []
    refseq = defaultdict(IntervalTree)

    # Pass 1: count the leading annotated lines, only so that the progress
    # bar in the second pass knows its total.
    n_annotated = 0
    with open(ribotricer_index, "r") as index_file:
        index_file.readline()  # skip header
        while "annotated" in index_file.readline():
            n_annotated += 1

    # Pass 2: parse those same lines.
    with open(ribotricer_index, "r") as index_file:
        with tqdm(total=n_annotated, unit="lines", leave=False) as pbar:
            index_file.readline()  # skip header
            # readline() returns "" at end of file, which ends the iteration.
            for line in iter(index_file.readline, ""):
                if "annotated" not in line:
                    break
                pbar.update()
                orf = ORF.from_string(line)
                if orf is None or orf.category != "annotated":
                    continue
                chrom_tree = refseq[orf.chrom]
                chrom_tree.insert(
                    Interval(
                        orf.intervals[0].start,
                        orf.intervals[-1].end,
                        STRAND_TO_NUM[orf.strand],
                    ))
                annotated.append(orf)
    return (annotated, refseq)
示例#11
0
    def setUp(self):
        # Randomised fixture: around every 15th position from 11 up to 20000,
        # create 2-5 intervals that extend 1-10 units to each side of it.
        spans = []
        for center in range(11, 20000, 15):
            for _ in range(random.randint(2, 5)):
                left_pad = random.randint(1, 10)
                right_pad = random.randint(1, 10)
                spans.append(Interval(center - left_pad, center + right_pad))

        # The first interval seeds the root; insert() returns the new root.
        first, rest = spans[0], spans[1:]
        tree = IntervalNode(first)
        for span in rest:
            tree = tree.insert(span)

        self.intervals = spans
        self.tree = tree
示例#12
0
 def create_annotation(self,
                       type: str,
                       start: int,
                       end: int,
                       attributes=None) -> Annotation:
     """Create an Annotation on this object, register it and return it.

     The annotation is built with the current ``self._next_id`` (which is
     then incremented), stored in ``self._annotations`` under an Interval
     spanning its start/end, and indexed in ``self._aid_dict`` by its
     ``annotation_id``. ``attributes`` defaults to a fresh empty list.
     """
     attrs = [] if attributes is None else attributes
     created = Annotation(self, start, end, type, attrs, self._next_id)
     self._next_id += 1

     # The annotation itself rides along as the interval's payload.
     entry = Interval(created.start, created.end, created)
     self._annotations.insert(entry)
     self._aid_dict[created.annotation_id] = created
     return created
示例#13
0
 def setUp(self):
     # Paths to the data files that sit next to this test module.
     here = os.path.dirname(__file__)

     def beside(name):
         return os.path.join(here, name)

     self.fa = beside('test.fa')
     self.fai = beside('test.fa.fai')
     self.gff3_1 = beside('test_1.gff3')
     self.gff3_2 = beside('test_2.gff3')

     self.A = np.array([[1, 1, 0, 0], [1, 1, 1, 1], [0, 1, 1, 0]])

     # Interval pairs; presumably named after the overlap length of each
     # pair (5, 0 and 3) -- confirm against the tests that use them.
     self.i5 = (Interval(0, 10), Interval(5, 15))
     self.i0 = (Interval(0, 10), Interval(10, 15))
     self.i3 = (Interval(0, 9), Interval(0, 3))
示例#14
0
    def test_left(self):
        # Cross-check the tree's left() against brute_force_find_left() for
        # random zero-length query points.
        max_dist = 200
        n = 15
        iv = self.tree
        for i in range(11, 20000, 25):
            for _ in range(random.randint(2, 5)):
                s1 = random.randint(i + 1, i + 20)
                f = Interval(s1, s1)

                bf = brute_force_find_left(self.intervals, f, max_dist, n)
                tf = iv.left(f, max_dist=max_dist, n=n)
                if len(tf) == 0:
                    # unittest assertion instead of a bare assert, so the
                    # check is not stripped under `python -O`.
                    self.assertEqual(len(bf), 0, bf)
                    continue

                # The tree result must cover exactly the brute-force result:
                # nothing outside it, and nothing from it missing.
                expected = set(bf)
                self.assertTrue(expected.issuperset(tf))
                missing = expected.difference(tf)
                self.assertEqual(len(missing), 0, missing)
def infer_protocol(bam, gene_interval_tree, prefix, n_reads=20000):
    """Infer strandedness protocol given a bam file

    Parameters
    ----------
    bam: str
         Path to bam file
    gene_interval_tree: defaultdict(IntervalTree)
            chrom: (start, end, strand)
    prefix: str
            Prefix for protocol file
    n_reads: int
             Number of reads to use (downsampled)

    Returns
    -------
    protocol: string
              forward/reverse

    The strategy to do this is simple: keep a track
    of mapped reads and their strand and then tally
    if the location of their mapping has a gene defined
    on the positive strand or the negative strand.

    If the first and second characters denote the mapping and
    gene strand respectively:
    Higher proportion of (++, --) implies forward protocol
    Higher proportion of (+-, -+) implies reverse protocol

    Only "forward" or "reverse" is ever returned: when the two
    proportions are equal the result is "forward".

    A summary of the tally is written to "<prefix>_protocol.txt".
    """
    strandedness = Counter()
    n_counted = 0
    # The context manager guarantees the BAM handle is closed, including when
    # iteration raises part-way through.
    with pysam.AlignmentFile(bam, "rb") as bam_handle:
        for read in bam_handle.fetch(until_eof=True):
            # Stop as soon as exactly n_reads informative reads have been
            # tallied, instead of scanning the remainder of the file.
            if n_counted >= n_reads:
                break
            if not is_read_uniq_mapping(read):
                continue
            mapped_strand = "-" if read.is_reverse else "+"
            chrom = read.reference_name
            # get corresponding gene's strand
            overlapping = list(
                set(gene_interval_tree[chrom].find(
                    Interval(read.reference_start, read.reference_end))))
            # Only reads hitting exactly one distinct interval are used, so
            # that the gene strand is unambiguous.
            if len(overlapping) != 1:
                continue
            gene_strand = NUM_TO_STRAND[overlapping[0].data]
            # count table for mapped strand vs gene strand
            strandedness["{}{}".format(mapped_strand, gene_strand)] += 1
            n_counted += 1

    # Add pseudocounts (also keeps the total non-zero for the ratios below)
    for combo in ("++", "--", "+-", "-+"):
        strandedness[combo] += 1

    total = sum(strandedness.values())
    forward_mapped_reads = strandedness["++"] + strandedness["--"]
    reverse_mapped_reads = strandedness["-+"] + strandedness["+-"]
    to_write = (
        "In total {} reads checked:\n"
        '\tNumber of reads explained by "++, --": {} ({:.4f})\n'
        '\tNumber of reads explained by "+-, -+": {} ({:.4f})\n').format(
            total,
            forward_mapped_reads,
            forward_mapped_reads / total,
            reverse_mapped_reads,
            reverse_mapped_reads / total,
        )
    with open("{}_protocol.txt".format(prefix), "w") as output:
        output.write(to_write)
    protocol = "forward"
    if reverse_mapped_reads > forward_mapped_reads:
        protocol = "reverse"
    return protocol
示例#16
0
 def test_find(self):
     # Smoke test: nothing is asserted about the result, only that find()
     # can be called on the fixture tree without raising.
     query = Interval(46, 47)
     self.tree.find(query)
示例#17
0
 def get_right_start(b10):
     # Start of the single nearest interval to the right of [b10, b10 + 1].
     # `iv` is not defined here; it is resolved from the surrounding scope.
     probe = Interval(b10, b10 + 1)
     hits = iv.right(probe, n=1)
     assert len(hits) == 1
     return hits[0].start
示例#18
0
 def test_feature_pickle(self):
     # An Interval carrying a data payload should survive a dumps()/loads()
     # round trip with its start and payload intact.
     original = Interval(22, 38, data={'a': 22})
     serialized = dumps(original)
     restored = loads(serialized)
     self.assertEqual(original.start, restored.start)
     self.assertEqual(restored.data['a'], 22)
示例#19
0
 def test_left(self):
     # Smoke test: nothing is asserted about the result, only that left()
     # can be called on the fixture tree without raising.
     query = Interval(46, 47)
     self.tree.left(query)
示例#20
0
 def test_toomany(self):
     # Requesting far more neighbours (200) than are expected to exist on the
     # left of [60, 70] should simply return the 6 available ones.
     tree = self.intervals
     hits = tree.left(Interval(60, 70), n=200)
     self.assertEqual(len(hits), 6)
示例#21
0
 def test_right(self):
     # Smoke test: nothing is asserted about the result, only that right()
     # can be called on the fixture tree without raising.
     query = Interval(46, 47)
     self.tree.right(query)
示例#22
0
 def test_left(self):
     # A query to the right of the stored intervals is expected to see two
     # of them on its left.
     hits_after = self.tree4.left(Interval(44, 55))
     self.assertEqual(2, len(hits_after))

     # A query to the left of everything stored is expected to see none.
     hits_before = self.tree4.left(Interval(11, 12))
     self.assertEqual(0, len(hits_before))
示例#23
0
 def setUp(self):
     # Fixture: a tree holding two intervals with identical coordinates
     # (22, 33) that differ only in their data payload.
     self.tree4 = tree = IntervalTree()
     for label in ('example1', 'example2'):
         tree.insert(Interval(22, 33, data=label))