def setUpClass(cls):
     """Populate ``cls.cols`` with the named columns of test data.

     Every column holds ``size`` entries except ``objA``, which is hard-coded
     to 5000 ``None`` values.  The random columns are drawn from
     `numpy.random` in the same order in which they appear in the dictionary.
     """
     rows = range(size)

     int_a = numpy.random.randint(0, high=2**16, size=size)
     int_b = numpy.random.randint(-10, high=20, size=size)
     idx_a = numpy.arange(size)

     # cycle through the capital letters: chr(65) is "A", chr(90) is "Z"
     chr_a = numpy.array([chr(65 + (row % (91 - 65))) for row in rows])

     # string representations of segments, one per row
     str_a = numpy.array(
         [str(GenomicSegment("chrA", row, row + 500, "+")) for row in rows])
     str_b = numpy.array([
         str(GenomicSegment("chrB", row / 2, row / 2 + 500, "-"))
         for row in rows
     ])

     float_a = 10 * numpy.random.randn(size) + 500
     float_b = (10**-5) * numpy.random.random(size)

     # NOTE(review): length is fixed at 5000 rather than `size` -- confirm
     # that `size` is always 5000 where this fixture is used
     obj_a = numpy.tile(None, 5000)

     # segments of random length (2 <= length < 1000), one starting at each row
     seg_lengths = numpy.random.randint(2, high=1000, size=size)
     obj_b = numpy.array([
         GenomicSegment("chrC", row, row + length, "+")
         for row, length in zip(rows, seg_lengths)
     ])

     cls.cols = {
         "intA": int_a,
         "intB": int_b,
         "idxA": idx_a,
         "chrA": chr_a,
         "strA": str_a,
         "strB": str_b,
         "floatA": float_a,
         "floatB": float_b,
         "objA": obj_a,
         "objB": obj_b,
     }
Пример #2
0
 def setUpClass(cls):
     """Create the segments and attribute dictionaries used by the tests.

     Sets three class attributes:

     ``cls.ivs``
         Four plus-strand segments on ``chrA``.

     ``cls.common_attr``
         Key/value pairs that are present in every attribute dictionary.

     ``cls.attrs``
         Four attribute dictionaries, each holding its own keys plus
         everything in ``cls.common_attr``.
     """
     spans = ((100, 190), (200, 203), (200, 201), (204, 206))
     cls.ivs = [
         GenomicSegment("chrA", start, end, "+") for start, end in spans
     ]

     cls.common_attr = {
         "common1": "common",
         "common2": "also common",
         "common3": "still common",
     }

     own_attrs = (
         {
             "common_diff_val": "unique_f1",
             "unique_f1_key": "something",
         },
         {
             "common_diff_val": "unique_f2",
             "unique_f2_key": "something",
             "unique_f2f3": "something else",
         },
         {
             "common_diff_val": "unique_f3",
             "unique_f3_key": "something",
             "unique_f2f3": "something else",
             "unique_f3f4": "f3 only",
         },
         {
             "common_diff_val": "unique_f4",
             "unique_f4_key": "something",
             "unique_f3f4": "f4 only",
         },
     )
     # each dictionary keeps its own keys and gains the shared ones
     cls.attrs = [dict(own, **cls.common_attr) for own in own_attrs]
Пример #3
0
def test_window_landmark():
    """Yield `check_window_landmark` cases for spliced chains on both strands.

    One two-segment chain is built per strand; each is checked at landmarks
    0, 50, ..., 650 with an upstream flank of 50 and a downstream flank of 100.
    """
    flank_up, flank_down = 50, 100

    # same pair of segments on the plus strand, then on the minus strand
    spliced_chains = [
        SegmentChain(GenomicSegment("chrA", 50, 350, strand),
                     GenomicSegment("chrA", 500, 900, strand))
        for strand in ("+", "-")
    ]

    for chain in spliced_chains:
        for landmark in range(0, 700, 50):
            yield check_window_landmark, chain, landmark, flank_up, flank_down
Пример #4
0
    def setUpClass(cls):
        """Build reads, expected count vectors, and the mapping-factory table.

        Sets the following class attributes:

        ``cls.strands``
            The two strand symbols.

        ``cls.segs``
            One 2000-long segment on ``mock`` per strand.

        ``cls.reads``
            Per strand, one alignment from ``cls.make_alignment`` for every
            value in ``range(25, 40)``.

        ``cls.expected``
            Maps ``(mapping, parameter, strand)`` to a 2000-long vector of
            expected values, for the ``fiveprime``, ``threeprime`` and
            ``center`` mappings with parameters 0 and 10.

        ``cls.map_factories``
            Maps mapping names to their factory callables.
        """
        shortest, longest = 25, 40
        lengths = range(shortest, longest)
        n_reads = longest - shortest

        cls.strands = ("+", "-")
        cls.segs = {
            strand: GenomicSegment("mock", 0, 2000, strand)
            for strand in cls.strands
        }
        cls.reads = {
            strand: [cls.make_alignment(0, length, strand) for length in lengths]
            for strand in ("+", "-")
        }

        # start every expectation as an all-zero vector
        cls.expected = {
            (mapping, param, strand): numpy.zeros(2000)
            for mapping in ("fiveprime", "threeprime", "center")
            for param in (0, 10)
            for strand in cls.strands
        }

        # terminal mappings: on one strand all reads land on a single position
        # (index == offset); on the other strand each read lands on its own
        # position, shifted down by the offset
        for mapping, stacked, spread in (("fiveprime", "+", "-"),
                                         ("threeprime", "-", "+")):
            for offset in (0, 10):
                cls.expected[(mapping, offset, stacked)][offset] = n_reads
                cls.expected[(mapping, offset, spread)][
                    shortest - 1 - offset:longest - 1 - offset] = 1

        # center mapping: each read spreads a total of 1.0 evenly over the
        # positions left after trimming `nibble` from each end
        for length in lengths:
            for strand in ("+", "-"):
                cls.expected[("center", 0, strand)][:length] += 1.0 / length
                cls.expected[("center", 10, strand)][10:length - 10] += (
                    1.0 / (length - 2 * 10))

        cls.map_factories = dict([
            ("fiveprime", FivePrimeMapFactory),
            ("threeprime", ThreePrimeMapFactory),
            ("fiveprime_variable", VariableFivePrimeMapFactory),
            ("center", CenterMapFactory),
        ])
Пример #5
0
    def check_random_windows_against_wig(self, strand):
        """Yield checks comparing ``self.bw`` to wiggle data over random windows.

        Loads ``wigfile`` into a fresh `GenomeArray` for `strand`, then draws
        random windows on randomly chosen chromosomes from ``self.chrdict``
        until 50 windows with a positive total count have been found.  For
        each such window a ``(self.check_vals_against_wig, expected, found)``
        tuple is yielded.

        Parameters
        ----------
        strand : str
            Strand passed to `add_from_wiggle` and used for each window
        """
        chrom_lengths = self.chrdict
        chrom_names = list(self.chrdict)
        picks = numpy.random.randint(0, high=len(chrom_names), size=50)
        wig_array = GenomeArray()

        n_checked = 0

        with open(wigfile) as wig_fh:
            wig_array.add_from_wiggle(wig_fh, strand)
            while n_checked < 50:
                # the same chromosome pick is retried until a window with
                # counts is found, since n_checked only advances on success
                chrom = chrom_names[picks[n_checked]]
                chrom_length = chrom_lengths[chrom]
                start = numpy.random.randint(0, high=chrom_length - 2000)
                end = numpy.random.randint(start + 10000, high=start + 20000)

                # redraw a shorter window until it fits on the chromosome
                while end > chrom_length:
                    end = numpy.random.randint(start + 100, high=start + 10000)

                window = GenomicSegment(chrom, start, end, strand)
                expected = wig_array[window]

                # skip windows without any counts
                if not expected.sum() > 0:
                    continue

                n_checked += 1
                found = self.bw[window]
                yield self.check_vals_against_wig, expected, found
Пример #6
0
def covered_by_repetitive(query_junc,minus_range,plus_range,cross_hash):
    """Determine whether either side of a splice junction falls in a
    repetitive area of the genome.

    Two windows are examined: one around the end of the first segment of
    `query_junc` and one around the start of its second segment. Each window
    spans `minus_range` through `plus_range` (inclusive) relative to that
    coordinate.

    Parameters
    ----------
    query_junc : |SegmentChain|
         A two-exon fragment representing a query splice junction

    minus_range : int <= 0
        Maximum number of nucleotides splice junction could be moved
        to the left without reducing sequence support for the junction
        see :py:func:`find_match_range`

    plus_range : int >= 0
        Maximum number of nucleotides splice junction could be moved
        to the right without reducing sequence support for the junction
        see :py:func:`find_match_range`

    cross_hash : |GenomeHash|
        |GenomeHash| of 1-length features denoting repetitive regions of the genome


    Returns
    -------
    bool
        `True` if ``cross_hash`` reports at least one feature overlapping
        either window. Otherwise, `False`
    """
    span = query_junc.spanning_segment
    junction_coords = (query_junc[0].end, query_junc[1].start)

    # `+ 1` because segment end coordinates are exclusive in the windows below
    windows = [
        GenomicSegment(span.chrom,
                       coord + minus_range,
                       coord + plus_range + 1,
                       span.strand)
        for coord in junction_coords
    ]

    hits = cross_hash.get_overlapping_features(SegmentChain(*windows))
    return len(hits) > 0
Пример #7
0
 def test_get_chromosome_counts_zero_fill(self):
     """Yield one whole-chromosome comparison per entry in ``self.chrdict``.

     The plus strand of ``wigfile`` is loaded into a `GenomeArray`; for every
     chromosome, the values over its full length are compared against
     ``self.bw.get_chromosome_counts`` via ``self.check_vals_against_wig``.
     """
     wig_array = GenomeArray()
     with open(wigfile) as wig_fh:
         wig_array.add_from_wiggle(wig_fh, "+")
         for chrom_name, chrom_length in self.chrdict.items():
             whole_chrom = GenomicSegment(chrom_name, 0, chrom_length, "+")
             from_wig = wig_array[whole_chrom]
             from_bw = self.bw.get_chromosome_counts(chrom_name)
             yield self.check_vals_against_wig, from_wig, from_bw
Пример #8
0
    def test_variable_stratified_mapping_plus(self):
        """Set up a `BAMGenomeArray` with a stratified variable 5' mapping.

        Builds an offset table, two reference chains (``fw`` and ``rc``) and
        their expected vectors loaded from text files, then opens the test
        BAM file and installs the mapping function.

        NOTE(review): no assertions are visible here, and `chains` and
        `expected` are never read -- the comparison step may be missing from
        this view; confirm against the full test.
        """
        # keys 26..30 match the bounds passed to the map factory below;
        # presumably read length -> offset (TODO confirm)
        offsets = {
            26 : 6,
            27 : 22,
            28 : 13,
            29 : 4,
            30 : 5
        }

        # one multi-segment chain per strand
        chains = {
            "fw" : SegmentChain(GenomicSegment('chrII',392959,393180,'+'),
                              GenomicSegment('chrII',393510,394742,'+'),
                              GenomicSegment('chrII',394860,394901,'+'),
                              ID='YBR078W_mRNA'),
            "rc" : SegmentChain(GenomicSegment('chrVIII',189061,189749,'-'),
                              GenomicSegment('chrVIII',189850,190017,'-'),
                              ID='YHR041C_mRNA')
        }
        # tab-delimited reference data packaged with plastid, keyed like `chains`
        expected = {
            "fw" : numpy.loadtxt(resource_filename("plastid","test/data/stratmap/strat_fw_vec.txt"),delimiter="\t"),
            "rc" : numpy.loadtxt(resource_filename("plastid","test/data/stratmap/strat_rc_vec.txt"),delimiter="\t"),
        }
        ga = BAMGenomeArray([resource_filename("plastid","test/data/stratmap/strat.bam")])
        ga.set_mapping(StratifiedVariableFivePrimeMapFactory(offsets,26,30))
Пример #9
0
 def filter(self,line):
     """Parse a read alignment as |SegmentChain| from a line of `bowtie`_ output

     Parameters
     ----------
     line : str
         Tab-delimited line. Columns 0-5 are read as read name, strand,
         reference sequence name, start coordinate, aligned sequence, and
         quality string; column 7 is read as the mismatch string.

     Returns
     -------
     |SegmentChain|
         Single-segment chain spanning the aligned sequence, carrying the
         remaining columns as attributes
     """
     fields = line.strip("\n").split("\t")

     name = fields[0]
     strand = fields[1]
     chrom = fields[2]
     start = int(fields[3])
     aligned_seq = fields[4]

     attr = dict(seq_as_aligned=aligned_seq,
                 qualstr=fields[5],
                 mismatch_str=fields[7],
                 type="alignment",
                 ID=name)

     # the alignment covers as many positions as there are aligned bases
     segment = GenomicSegment(chrom, start, start + len(aligned_seq), strand)
     return SegmentChain(segment, **attr)
Пример #10
0
    def test_fill_val_present_chrom(self):
        """Check fetch size and zero-fill over an empty region of ``chrIV``.

        Four readers are opened on ``bigwigfile`` (default fill, NaN, 0 and
        10); only the default and zero-fill readers are asserted on.
        """
        default_reader = BigWigReader(bigwigfile)
        nan_reader = BigWigReader(bigwigfile, fill=numpy.nan)
        zero_reader = BigWigReader(bigwigfile, fill=0)
        ten_reader = BigWigReader(bigwigfile, fill=10)

        # empty region
        empty_seg = GenomicSegment("chrIV", 5, 10, "+")

        fetched = default_reader[empty_seg]
        assert_equal(len(fetched), len(empty_seg), "fetched wrong size")

        # NOTE: the all-NaN checks on `default_reader` ("default not nan") and
        # `nan_reader` ("nanfill didn't work") are currently disabled

        zero_filled = zero_reader[empty_seg] == 0
        assert_true(zero_filled.all(), "0-fill didn't work")
Пример #11
0
    def test_fill_val_absent_chrom(self):
        """Check fetch size and zero-fill on a chromosome missing from the file.

        Four readers are opened on ``bigwigfile`` (default fill, NaN, 0 and
        10); only the default and zero-fill readers are asserted on.
        """
        default_reader = BigWigReader(bigwigfile)
        nan_reader = BigWigReader(bigwigfile, fill=numpy.nan)
        zero_reader = BigWigReader(bigwigfile, fill=0)
        ten_reader = BigWigReader(bigwigfile, fill=10)

        # chrVI is not in dataset; this should be an empty array
        absent_seg = GenomicSegment("chrVI", 5, 1000, "+")

        fetched = default_reader[absent_seg]
        assert_equal(len(fetched), len(absent_seg), "fetched wrong size")

        # NOTE: the all-NaN checks on `default_reader` ("default not nan") and
        # `nan_reader` ("nanfill didn't work") are currently disabled

        zero_filled = zero_reader[absent_seg] == 0
        assert_true(zero_filled.all(), "0-fill didn't work")
Пример #12
0
def revcomp_mask_chain(seg, k, offset=0):
    """Reverse-complement a single-interval mask, correcting for `offset`.

    Only the spanning segment of `seg` is used: its start and end are both
    shifted by ``k - 1 - 2*offset`` and placed on the minus strand.

    Parameters
    ----------
    seg : |SegmentChain|
        Plus-strand mask, including `offset`

    k : int
        Length of k-mers

    offset : int, optional
        Offset from 5' end of read at which to map mask (Default: `0`)

    Returns
    -------
    |SegmentChain|
        Mask on minus strand corresponding to `seg`
    """
    # Derivation of the shift:
    #
    #     With FW a plus-strand coordinate and RC its minus-strand
    #     counterpart, a k-mer gives
    #
    #         RC = FW + k - 1 - offset
    #
    #     The input already carries the offset (it is FW + offset), so
    #
    #         RC + offset = (FW + offset) + k - 1 - offset
    #         RC          = (FW + offset) + k - 1 - 2*offset
    plus_span = seg.spanning_segment
    shift = k - 1 - 2 * offset

    minus_start = plus_span.start + shift
    minus_end = plus_span.end + shift
    minus_segment = GenomicSegment(plus_span.chrom, minus_start, minus_end, "-")
    return SegmentChain(minus_segment)
Пример #13
0
 def test_search_fields_multivalue(self):
     """Search an indexed field for several values in a single call.

     A search on ``name`` for two unmatched values must return nothing; a
     search on ``Name`` for ``Sam-S-RE`` and ``Sam-S-RK`` must return exactly
     those two transcripts, in that order.
     """
     def transcript(exons, alias, ident, name, thickend, thickstart):
         # all expected records are plus-strand chains on 2L that share the
         # same color, gene_id, score and type
         segments = [GenomicSegment('2L', start, end, '+') for start, end in exons]
         return SegmentChain(*segments,
                             Alias=alias,
                             ID=ident,
                             Name=name,
                             color='#000000',
                             gene_id='FBgn0005278',
                             score='0.0',
                             thickend=thickend,
                             thickstart=thickstart,
                             type='exon')

     reader = BigBedReader(self.bb_indexed)

     no_hits = list(
         reader.search("name", "should_have_no_match",
                       "should_also_have_no_match"))
     self.assertEqual([], no_hits)

     found = list(reader.search("Name", "Sam-S-RE", "Sam-S-RK"))
     expected = [
         transcript(
             [(106902, 107000), (107764, 107838), (108587, 108809),
              (110405, 110483), (110754, 110877), (111906, 112019),
              (112689, 113369), (113433, 114432)],
             "'['M(2)21AB-RE', 'CG2674-RE']'", 'FBtr0089437', 'Sam-S-RE',
             '113542', '108685'),
         transcript(
             [(107760, 107838), (108587, 108809), (110405, 110483),
              (110754, 111337)],
             'na', 'FBtr0308091', 'Sam-S-RK',
             '110900', '108685'),
     ]
     self.assertEqual(expected, found)
Пример #14
0
def fa_to_bed(toomany_fh, k, offset=0):
    """Find genomic regions giving rise to reads in a `bowtie`_ ``toomany`` file

    Read names are parsed with ``namepat`` into a chromosome and a position.
    Runs of reads on the same chromosome whose positions increase by exactly
    one are merged into a single region. A region is emitted whenever the
    chromosome changes or a gap appears, and once more at end of input.
    An empty input yields nothing.

    Parameters
    ----------
    toomany_fh : file-like
        Open filehandle to fasta-formatted ``toomany`` file from `bowtie`_

    k : int
        Length of k-mers

    offset : int, optional
        Offset from 5' end of read at which to map read, if any (Default: `0`)

    Yields
    ------
    tuple
        ``(plus_chain, minus_chain)``: a plus-strand |SegmentChain|
        representing a repetitive region, and the corresponding minus-strand
        |SegmentChain| computed by :py:func:`revcomp_mask_chain`

    Raises
    ------
    MalformedFileError
        If a read's position is not greater than that of the preceding read
        on the same chromosome
    """
    def make_pair(chrom, start, end):
        # plus-strand region plus its minus-strand counterpart
        plus_chain = SegmentChain(GenomicSegment(chrom, start, end, "+"))
        return plus_chain, revcomp_mask_chain(plus_chain, k, offset)

    last_chrom = None
    last_pos = None
    start_pos = None
    reader = FastaNameReader(toomany_fh)

    for n, read_name in enumerate(reader):
        chrom, pos = namepat.search(read_name).groups()
        pos = int(pos) + offset

        if chrom == last_chrom:
            delta = pos - last_pos
            if delta == 1:
                # contiguous with the open region: just extend it
                last_pos = pos
                continue
            if delta < 1:
                msg = "k-mers are not sorted at read %s! Aborting." % read_name
                raise MalformedFileError(toomany_fh, msg, line_num=n)

        # new chromosome or a gap: close the open region, if there is one
        if last_chrom is not None:
            yield make_pair(last_chrom, start_pos, last_pos + 1)

        last_chrom = chrom
        start_pos = pos
        last_pos = pos

    # export final feature; skipped when the input held no reads, which
    # previously failed on the unbound `chrom` and `None` coordinates
    if last_chrom is not None:
        yield make_pair(last_chrom, start_pos, last_pos + 1)
Пример #15
0
                                       ],
                         # these below are all 1 nucleotide outside match range
                        'YBR215W_mRNA_0'  : ['YBR215W_mRNA_0:0-105^189-2175(+)', 'YBR215W_mRNA_0:0-109^193-2175(+)'],
                        'YHL001W_mRNA_0'  : ['YHL001W_mRNA_0:0-143^541-961(+)', 'YHL001W_mRNA_0:0-149^547-961(+)'],
                        'YIL018W_mRNA_0'  : ['YIL018W_mRNA_0:0-28^428-1280(+)', 'YIL018W_mRNA_0:0-34^434-1280(+)'],
                        'YIL133C_mRNA_0'  : ['YIL133C_mRNA_0:0-644^934-1007(-)', 'YIL133C_mRNA_0:0-650^940-1007(-)'],
                        'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-40^102-408(+)', 'YIL156W_B_mRNA_0:0-45^107-408(+)'],
                        'YKL006W_mRNA_0'  : ['YKL006W_mRNA_0:0-154^552-954(+)', 'YKL006W_mRNA_0:0-160^558-954(+)'],
                        'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-324^396-729(-)', 'YMR194C_B_mRNA_0:0-328^400-729(-)'],
                        'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-410^648-697(-)', 'YPL249C_A_mRNA_0:0-417^655-697(-)']                         
                         }
# replace each junction string with the SegmentChain it describes
unmatched_query_juncs = dict(
    (transcript, [SegmentChain.from_str(junc) for junc in junc_strs])
    for transcript, junc_strs in unmatched_query_juncs.items()
)
"""Query junctions with no known matches"""

unmatched_noncan_query_juncs = [
    SegmentChain.from_str(junc)
    for junc in (
        "YNL130C:0-23^145-180(-)",
        "YNL130C:0-53^165-180(-)",
        "YNL130C:0-70^141-180(-)",
        "YNL130C:0-49^121-180(-)",
    )
]
"""Query junctions without canonical splice junctions in the match range"""


repetitive_regions = [
    "YBR215W_mRNA_0:190-193(+)",   # threeprime splice site plus
    "YHL001W_mRNA_0:144-149(+)",   # fiveprime splice site plus
    "YIL133C_mRNA_0:935-940(-)",   # threeprime splice site minus
    "YMR194C_B_mRNA_0:325-328(-)", # fiveprime splice site minus
]

# hash of single-segment chains, one per repetitive region
cross_hash = GenomeHash(
    list(SegmentChain(GenomicSegment.from_str(region))
         for region in repetitive_regions)
)

# names of the sequences that have at least one feature in the hash
cross_hash_seqs = set(
    feature.chrom for feature in cross_hash.feature_dict.values()
)
Пример #16
0
        reader = BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT),
                            extra_columns=14)
        with warnings.catch_warnings(record=True) as warns:
            warnings.simplefilter("always")
            ltmp = list(reader)
            assert_greater_equal(len(warns), 0)


#===============================================================================
# INDEX: test data
#===============================================================================

# test dataset, constructed manually to include various edge cases
_TEST_SEGMENTCHAINS = [
    # single-interval
    SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), ID="IVC1p"),
    SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), ID="IVC1m"),
    # multi-interval
    SegmentChain(GenomicSegment("chrA", 100, 1100, "+"),
                 GenomicSegment("chrA", 2100, 2600, "+"),
                 ID="IVC2p"),
    SegmentChain(GenomicSegment("chrA", 100, 1100, "-"),
                 GenomicSegment("chrA", 2100, 2600, "-"),
                 ID="IVC2m"),
    # multi-interval, with score
    SegmentChain(GenomicSegment("chrA", 100, 1100, "+"),
                 GenomicSegment("chrA", 2100, 2600, "+"),
                 ID="IVC3p",
                 score=500),
    SegmentChain(GenomicSegment("chrA", 100, 1100, "-"),
                 GenomicSegment("chrA", 2100, 2600, "-"),
Пример #17
0
    def test_search_fields_singlevalue(self):
        """Search indexed fields for a single value.

        Three searches are checked: an unmatched value on ``name`` (no
        results), ``Sam-S-RE`` on ``Name`` (one transcript), and
        ``FBgn0005278`` on ``gene_id`` (twelve transcripts, compared after
        sorting).
        """
        def transcript(exons, alias, ident, name, thickend, thickstart):
            # all expected records are plus-strand chains on 2L that share
            # the same color, gene_id, score and type
            segments = [
                GenomicSegment('2L', start, end, '+') for start, end in exons
            ]
            return SegmentChain(*segments,
                                Alias=alias,
                                ID=ident,
                                Name=name,
                                color='#000000',
                                gene_id='FBgn0005278',
                                score='0.0',
                                thickend=thickend,
                                thickstart=thickstart,
                                type='exon')

        # Sam-S-RE is expected by both the Name and the gene_id searches
        sam_s_re = (
            [(106902, 107000), (107764, 107838), (108587, 108809),
             (110405, 110483), (110754, 110877), (111906, 112019),
             (112689, 113369), (113433, 114432)],
            "'['M(2)21AB-RE', 'CG2674-RE']'", 'FBtr0089437', 'Sam-S-RE',
            '113542', '108685',
        )

        reader = BigBedReader(self.bb_indexed)

        no_hits = list(reader.search("name", "should_have_no_match"))
        self.assertEqual([], no_hits)

        found = list(reader.search("Name", "Sam-S-RE"))
        expected = [transcript(*sam_s_re)]
        self.assertEqual(expected, found)

        found = list(reader.search("gene_id", "FBgn0005278"))
        expected = [
            transcript(*sam_s_re),
            transcript(
                [(107760, 107838), (108587, 108809), (110405, 110483),
                 (110754, 111337)],
                'na', 'FBtr0308091', 'Sam-S-RK',
                '110900', '108685'),
            transcript(
                [(107760, 107838), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111004, 111117), (111906, 112019),
                 (112689, 113369), (113433, 114210)],
                "'['M(2)21AB-RB', 'CG2674-RB']'", 'FBtr0089428', 'Sam-S-RB',
                '112741', '108685'),
            transcript(
                [(107760, 107838), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111906, 112019), (112689, 113369),
                 (113433, 114432)],
                "'['M(2)21AB-RA', 'CG2674-RA']'", 'FBtr0089429', 'Sam-S-RA',
                '113542', '108685'),
            transcript(
                [(107760, 107956), (108587, 108809), (110405, 110483),
                 (110754, 110877), (112689, 113369), (113433, 114432)],
                'na', 'FBtr0330656', 'Sam-S-RL',
                '112781', '108685'),
            transcript(
                [(107936, 108226), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111906, 112019), (112689, 113369),
                 (113433, 114210)],
                "'['M(2)21AB-RH', 'CG2674-RH']'", 'FBtr0089432', 'Sam-S-RH',
                '113542', '108685'),
            transcript(
                [(107936, 108101), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111906, 112019), (112689, 113369),
                 (113433, 114432)],
                "'['M(2)21AB-RD', 'CG2674-RD']'", 'FBtr0089430', 'Sam-S-RD',
                '113542', '108685'),
            transcript(
                [(107936, 108101), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111004, 111117), (112689, 113369),
                 (113433, 114432)],
                "'['M(2)21AB-RC', 'CG2674-RC']'", 'FBtr0089431', 'Sam-S-RC',
                '113542', '108685'),
            transcript(
                [(108088, 108226), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111906, 112019), (112689, 113369),
                 (113433, 114432)],
                "'['M(2)21AB-RF', 'CG2674-RF']'", 'FBtr0089433', 'Sam-S-RF',
                '113542', '108685'),
            transcript(
                [(108132, 108346), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111906, 112019), (112689, 113369),
                 (113433, 114432)],
                "'['M(2)21AB-RI', 'CG2674-RI']'", 'FBtr0089434', 'Sam-S-RI',
                '113542', '108685'),
            transcript(
                [(108132, 108226), (108587, 108809), (110405, 110483),
                 (110754, 110877), (111004, 111117), (112689, 113369),
                 (113433, 114432)],
                "'['M(2)21AB-RJ', 'CG2674-RJ']'", 'FBtr0089435', 'Sam-S-RJ',
                '113542', '108685'),
            transcript(
                [(109593, 109793), (110405, 110483), (110754, 110877),
                 (111004, 111117), (112689, 113369), (113433, 114210)],
                "'['M(2)21AB-RG', 'CG2674-RG']'", 'FBtr0089436', 'Sam-S-RG',
                '113542', '109750'),
        ]
        # result order is not checked for this search
        self.assertEqual(sorted(expected), sorted(found))
Пример #18
0
CCCTCCTTCCGCTGGCCCCGACTGC
>chr30b:1(+)
CCTCCTTCCGCTGGCCCCGACTGCC
>chr30b:2(+)
CTCCTTCCGCTGGCCCCGACTGCCC
>chr30b:3(+)
TCCTTCCGCTGGCCCCGACTGCCCC
>chr30b:4(+)
CCTTCCGCTGGCCCCGACTGCCCCA
>chr30b:5(+)
CTTCCGCTGGCCCCGACTGCCCCAG
"""

# (plus-strand mask, minus-strand mask) pairs; each minus-strand mask is the
# plus-strand one shifted by 25 - 1 positions
CROSSMAP1 = [
    (
        SegmentChain(GenomicSegment(chrom, start, end, "+")),
        SegmentChain(GenomicSegment(chrom, start + 25 - 1, end + 25 - 1, "-")),
    )
    for chrom, start, end in (
        ("chr50a", 1, 10),
        ("chr50a", 19, 26),
        ("chr30b", 0, 6),
    )
]

CROSSMAP2 = [
    (
        SegmentChain(GenomicSegment("chr50a", 1 + 1000, 10 + 1000, "+")),
Пример #19
0
    'YMR194C_B_mRNA_0':
    ['YMR194C_B_mRNA_0:0-324^396-729(-)', 'YMR194C_B_mRNA_0:0-328^400-729(-)'],
    'YPL249C_A_mRNA_0':
    ['YPL249C_A_mRNA_0:0-410^648-697(-)', 'YPL249C_A_mRNA_0:0-417^655-697(-)']
}
# parse the junction strings of every transcript into SegmentChains
unmatched_query_juncs = {
    gene: list(map(SegmentChain.from_str, junc_strs))
    for gene, junc_strs in unmatched_query_juncs.items()
}
"""Query junctions with no known matches"""

unmatched_noncan_query_juncs = list(
    map(
        SegmentChain.from_str,
        [
            "YNL130C:0-23^145-180(-)",
            "YNL130C:0-53^165-180(-)",
            "YNL130C:0-70^141-180(-)",
            "YNL130C:0-49^121-180(-)",
        ],
    ))
"""Query junctions without canonical splice junctions in the match range"""

repetitive_regions = [
    "YBR215W_mRNA_0:190-193(+)",  # threeprime splice site plus
    "YHL001W_mRNA_0:144-149(+)",  # fiveprime splice site plus
    "YIL133C_mRNA_0:935-940(-)",  # threeprime splice site minus
    "YMR194C_B_mRNA_0:325-328(-)",  # fiveprime splice site minus
]

# one single-segment chain per repetitive region, collected into a hash
cross_hash = GenomeHash(
    list(map(SegmentChain, map(GenomicSegment.from_str, repetitive_regions))))

# names of the sequences that have at least one feature in the hash
cross_hash_seqs = set(
    chain.chrom for chain in cross_hash.feature_dict.values())
Пример #20
0
def test_exit_status():
    """Generate exit-status checks for :py:mod:`plastid.bin.test_table_equality`

    A table of random columns is built and written to temporary files in
    several variants (with an extra column, with slightly perturbed values in
    that column, with shuffled rows), each both with and without a header
    row. One check is then yielded per scenario listed in `tests`.

    All temporary files are closed and removed, and `cleanup_resources` is
    called, once the generator is exhausted or closed, even if writing
    the tables fails part-way.

    Yields
    ------
    tuple
        `(check_exit_status, test_name, argstr, expected_exit)`
    """
    # define columns; every column must have exactly `size` rows
    cols = {
        "intA"   : numpy.random.randint(0,high=2**16,size=size),
        "intB"   : numpy.random.randint(-10,high=20,size=size),
        "idxA"   : numpy.arange(size),
        "chrA"   : numpy.array([chr(65+(X%(91-65))) for X in range(size)]),
        "strA"   : numpy.array([str(GenomicSegment("chrA",X,X+500,"+")) for X in range(size)]),
        # floor division keeps coordinates integral under Python 2 and 3 alike
        "strB"   : numpy.array([str(GenomicSegment("chrB",X//2,X//2+500,"-")) for X in range(size)]),
        "floatA" : 10*numpy.random.randn(size) + 500,
        "floatB" : (10**-5)*numpy.random.random(size),
        "objA"   : numpy.tile(None,size),
        "objB"   : numpy.array([GenomicSegment("chrC",X,X+Y,"+") for X,Y in zip(range(size),numpy.random.randint(2,high=1000,size=size))]),
    }

    # allocate temp files we will use
    headerfile            = NamedTemporaryFile(delete=False,mode="w")
    headerfile_extra_cols = NamedTemporaryFile(delete=False,mode="w")
    headerfile_extra_cols_diff = NamedTemporaryFile(delete=False,mode="w")
    headerfile_extra_cols_shuffled = NamedTemporaryFile(delete=False,mode="w")
    headerfile_shuffled   = NamedTemporaryFile(delete=False,mode="w")
    headerfile_diff_vals  = NamedTemporaryFile(delete=False,mode="w")

    noheaderfile            = NamedTemporaryFile(delete=False,mode="w")
    noheaderfile_extra_cols = NamedTemporaryFile(delete=False,mode="w")
    noheaderfile_extra_cols_diff = NamedTemporaryFile(delete=False,mode="w")
    noheaderfile_extra_cols_shuffled = NamedTemporaryFile(delete=False,mode="w")
    noheaderfile_shuffled   = NamedTemporaryFile(delete=False,mode="w")
    noheaderfile_diff_vals  = NamedTemporaryFile(delete=False,mode="w")

    tempfiles = [
        headerfile,
        headerfile_extra_cols,
        headerfile_extra_cols_diff,
        headerfile_extra_cols_shuffled,
        headerfile_shuffled,
        headerfile_diff_vals,
        noheaderfile,
        noheaderfile_extra_cols,
        noheaderfile_extra_cols_diff,
        noheaderfile_extra_cols_shuffled,
        noheaderfile_shuffled,
        noheaderfile_diff_vals,
    ]

    try:
        # write values; header-less files need a fixed column order so that
        # integer column indices refer to the same column in every file
        keyorder = ["idxA"] + sorted(set(cols) - {"idxA"})

        table1 = pd.DataFrame(cols)
        table1.to_csv(headerfile,index=False,header=True,sep="\t")
        table1.to_csv(noheaderfile,index=False,header=False,sep="\t",
                      columns=keyorder)
        headerfile.close()
        noheaderfile.close()

        table1["extra"] = 2**7 * numpy.random.random(size=size)
        table1.to_csv(headerfile_extra_cols,index=False,header=True,sep="\t")
        table1.to_csv(noheaderfile_extra_cols,index=False,header=False,sep="\t",
                      columns=["extra"]+keyorder)
        headerfile_extra_cols.close()
        noheaderfile_extra_cols.close()

        # perturb the extra column by less than the tolerance used below
        table1["extra"] += 10**-4 * numpy.random.random(size=size)
        table1.to_csv(headerfile_extra_cols_diff,index=False,header=True,sep="\t")
        table1.to_csv(noheaderfile_extra_cols_diff,index=False,header=False,sep="\t",
                      columns=["extra"]+keyorder)
        headerfile_extra_cols_diff.close()
        noheaderfile_extra_cols_diff.close()

        shufidx = numpy.arange(size)
        shuffle(shufidx)
        table2 = pd.DataFrame({ K : V[shufidx] for K,V in cols.items()})
        table2.to_csv(headerfile_shuffled,index=False,header=True,sep="\t")
        table2.to_csv(noheaderfile_shuffled,index=False,header=False,sep="\t",
                      columns=keyorder)
        headerfile_shuffled.close()
        noheaderfile_shuffled.close()

        # use the raw array: assigning a Series would be re-aligned on index
        # by pandas, silently undoing the shuffle for this column
        table2["extra"] = table1["extra"].values[shufidx]
        table2.to_csv(headerfile_extra_cols_shuffled,index=False,header=True,sep="\t")
        table2.to_csv(noheaderfile_extra_cols_shuffled,
                      index=False,header=False,sep="\t",
                      columns=["extra"]+keyorder)
        headerfile_extra_cols_shuffled.close()
        noheaderfile_extra_cols_shuffled.close()

        # Define tests, as tuples of:
        #   -Test name/description
        #   -Command-line arguments to pass to :py:mod:`plastid.bin.test_table_equality`
        #   -Expected exit code/returns status for :py:func:`main`
        tests = [
            ("same",
                "%s %s" % (headerfile.name,headerfile.name),
                0),
            ("diff_column_names",
                "%s %s" % (headerfile.name,headerfile_extra_cols.name),
                1),
            ("extra_column_names_ignored",
                "%s %s --exclude extra" % (headerfile.name,headerfile_extra_cols.name),
                0),
            ("shuffled_rows",
                "%s %s" % (headerfile.name,headerfile_shuffled.name),
                1),
            ("shuffled_rows_name_sort",
                "%s %s --sort_keys idxA" % (headerfile.name,headerfile_shuffled.name),
                0),
            ("shuffled_rows_multi_name_sort",
                "%s %s --sort_keys strB chrA" % (headerfile.name,headerfile_shuffled.name),
                0),
            ("same_column_names_diff_values",
                "%s %s" % (headerfile_extra_cols.name,headerfile_extra_cols_diff.name),
                1),
            ("same_column_names_diff_values_tol",
                "%s %s --tol 0.01" % (headerfile_extra_cols.name,headerfile_extra_cols_diff.name),
                0),
            ("same_column_names_diff_values_ignored",
                "%s %s --exclude extra" % (headerfile_extra_cols.name,headerfile_extra_cols_diff.name),
                0),
            ("shuffled_rows_extra_columns_ignored",
                "%s %s --exclude extra" % (headerfile.name,headerfile_extra_cols_shuffled.name),
                1),
            ("shuffled_rows_extra_columns_ignored_name_sort",
                "%s %s --exclude extra --sort_keys idxA" % (headerfile.name,headerfile_extra_cols_shuffled.name),
                0),

            ("noheader_same",
                "%s %s --no_header" % (noheaderfile.name,noheaderfile.name),
                0),
            ("noheader_extra_columns",
                "%s %s --no_header" % (noheaderfile.name,noheaderfile_extra_cols.name),
                1),
            ("noheader_shuffled_rows",
                "%s %s --no_header" % (noheaderfile.name,noheaderfile_shuffled.name),
                1),
            ("noheader_shuffled_rows_int_sort",
                "%s %s --no_header --sort_keys 0" % (noheaderfile.name,noheaderfile_shuffled.name),
                0),
            ("noheader_diff_values",
                "%s %s --no_header" % (noheaderfile_extra_cols.name,noheaderfile_extra_cols_diff.name),
                1),
            ("noheader_diff_values_tol",
                "%s %s --no_header --tol 0.01" % (noheaderfile_extra_cols.name,noheaderfile_extra_cols_diff.name),
                0),
            ("no_header_diff_values_ignored",
                "%s %s --no_header --exclude 0" % (noheaderfile_extra_cols.name,noheaderfile_extra_cols_diff.name),
                0),
            ("no_header_shuffled_rows_extra_columns_ignored_int_sort",
                "%s %s --no_header --exclude 0 --sort_keys 1" % (noheaderfile_extra_cols.name,noheaderfile_extra_cols_shuffled.name),
                0)
        ]

        for test_name, argstr, expected_exit in tests:
            yield check_exit_status, test_name, argstr, expected_exit
    finally:
        # clean up; runs when the generator is exhausted, closed, or fails
        try:
            for fh in tempfiles:
                # closing an already-closed file is a no-op; the *_diff_vals
                # files are never written, so this is their only close
                fh.close()
                os.unlink(fh.name)
        finally:
            cleanup_resources()
Пример #21
0
 def setUpClass(cls):
     """Populate `cls.ivcs` with plus-strand chains and shifted minus-strand chains

     `cls.ivcs` maps a label to a list of three single-segment
     |SegmentChains| on `chrA`. The `"plus"` entry holds the reference
     plus-strand chains; each `"minus_kK_offO"` entry holds minus-strand
     chains whose coordinates equal the plus-strand coordinates shifted by
     `K - 1 - 2 * O`.
     """
     # (start, end) of each plus-strand reference segment
     plus_coords = [(0, 100), (50, 100), (50, 51)]

     def shifted_minus_chains(k, offset):
         # minus-strand counterparts, displaced by k - 1 - 2 * offset
         shift = k - 1 - 2 * offset
         return [
             SegmentChain(
                 GenomicSegment("chrA", start + shift, end + shift, "-"))
             for start, end in plus_coords
         ]

     cls.ivcs = {
         "plus": [
             SegmentChain(GenomicSegment("chrA", start, end, "+"))
             for start, end in plus_coords
         ],
         "minus_k25_off0": shifted_minus_chains(25, 0),
         "minus_k50_off0": shifted_minus_chains(50, 0),
         "minus_k25_off10": shifted_minus_chains(25, 10),
         "minus_k50_off10": shifted_minus_chains(50, 10),
     }
Пример #22
0
def find_canonicals_in_range(query_junc,minus_range,plus_range,genome,canonicals):
    """Find canonical splice junctions within `minus_range...plus_range`
    of `query_junc`

    Each candidate is made by sliding both boundaries of the query junction
    by the same number of nucleotides. A candidate is reported when the two
    nucleotides just inside each edge of its gap match one of the
    dinucleotide pairs in `canonicals`. Every reported junction therefore:

      1. has boundaries within `minus_range...plus_range` of the
         boundaries of the query junction,

      2. spans the same nucleotide distance as the junction in
         `query_junc`, and

      3. lies on the same chromosome and strand.


    Parameters
    ----------
    query_junc : |SegmentChain|
         A two-exon fragment representing a query splice junction

    minus_range : int <= 0
        Maximum number of nucleotides the splice junction could be moved
        to the left without reducing sequence support for the junction.
        See :py:func:`find_match_range`

    plus_range : int >= 0
        Maximum number of nucleotides the splice junction could be moved
        to the right without reducing sequence support for the junction.
        See :py:func:`find_match_range`

    genome : dict
        dict mapping chromosome names to :py:class:`Bio.SeqRecord.SeqRecord` s

    canonicals : list
        dinucleotide sequences to consider as canonical splice sites,
        as a list of tuples. e.g. `[("GT","AG"), ("GC","AG")]`


    Returns
    -------
    list
        List of |SegmentChains| representing canonical splice junctions in
        `minus_range...plus_range` of `query_junc`, ordered by increasing
        shift
    """
    chrom = query_junc.chrom
    strand = query_junc.strand
    left_exon, right_exon = query_junc[0], query_junc[1]

    left_start, left_end = left_exon.start, left_exon.end
    right_start, right_end = right_exon.start, right_exon.end

    found = []
    for shift in range(minus_range, plus_range + 1):
        # both edges of the gap move together, preserving its length
        gap_start = left_end + shift
        gap_end = right_start + shift

        for pair in canonicals:
            # first two nucleotides of the gap must match the first motif
            if str(genome[chrom][gap_start:gap_start + 2].seq) != pair[0]:
                continue

            # last two nucleotides of the gap must match the second motif
            if str(genome[chrom][gap_end - 2:gap_end].seq) != pair[1]:
                continue

            found.append(
                SegmentChain(
                    GenomicSegment(chrom, left_start, gap_start, strand),
                    GenomicSegment(chrom, gap_end, right_end, strand),
                )
            )

    return found