示例#1
0
    def test_search_fields_singlevalue(self):
        """Search the indexed BigBed file for a single value per field.

        Three queries are run against ``self.bb_indexed``: a value that
        should match nothing, a ``Name`` lookup expected to return exactly
        one chain, and a ``gene_id`` lookup expected to return twelve
        chains. The last comparison sorts both sides, so the order in
        which hits are returned is not checked.
        """
        def build_chain(name, tx_id, alias, thickstart, thickend, *exons):
            # Every expected record lies on '2L', '+' strand, and shares the
            # color / gene_id / score / type attributes; only the arguments
            # of this helper vary from record to record.
            blocks = [GenomicSegment('2L', lo, hi, '+') for lo, hi in exons]
            return SegmentChain(*blocks,
                                Alias=alias,
                                ID=tx_id,
                                Name=name,
                                color='#000000',
                                gene_id='FBgn0005278',
                                score='0.0',
                                thickend=thickend,
                                thickstart=thickstart,
                                type='exon')

        # (name, ID, Alias, thickstart, thickend, exon (start, end) pairs...)
        sam_s_re = ('Sam-S-RE', 'FBtr0089437', "'['M(2)21AB-RE', 'CG2674-RE']'",
                    '108685', '113542',
                    (106902, 107000), (107764, 107838), (108587, 108809),
                    (110405, 110483), (110754, 110877), (111906, 112019),
                    (112689, 113369), (113433, 114432))
        gene_records = [
            sam_s_re,
            ('Sam-S-RK', 'FBtr0308091', 'na', '108685', '110900',
             (107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 111337)),
            ('Sam-S-RB', 'FBtr0089428', "'['M(2)21AB-RB', 'CG2674-RB']'",
             '108685', '112741',
             (107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (111906, 112019),
             (112689, 113369), (113433, 114210)),
            ('Sam-S-RA', 'FBtr0089429', "'['M(2)21AB-RA', 'CG2674-RA']'",
             '108685', '113542',
             (107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RL', 'FBtr0330656', 'na', '108685', '112781',
             (107760, 107956), (108587, 108809), (110405, 110483),
             (110754, 110877), (112689, 113369), (113433, 114432)),
            ('Sam-S-RH', 'FBtr0089432', "'['M(2)21AB-RH', 'CG2674-RH']'",
             '108685', '113542',
             (107936, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114210)),
            ('Sam-S-RD', 'FBtr0089430', "'['M(2)21AB-RD', 'CG2674-RD']'",
             '108685', '113542',
             (107936, 108101), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RC', 'FBtr0089431', "'['M(2)21AB-RC', 'CG2674-RC']'",
             '108685', '113542',
             (107936, 108101), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RF', 'FBtr0089433', "'['M(2)21AB-RF', 'CG2674-RF']'",
             '108685', '113542',
             (108088, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RI', 'FBtr0089434', "'['M(2)21AB-RI', 'CG2674-RI']'",
             '108685', '113542',
             (108132, 108346), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RJ', 'FBtr0089435', "'['M(2)21AB-RJ', 'CG2674-RJ']'",
             '108685', '113542',
             (108132, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RG', 'FBtr0089436', "'['M(2)21AB-RG', 'CG2674-RG']'",
             '109750', '113542',
             (109593, 109793), (110405, 110483), (110754, 110877),
             (111004, 111117), (112689, 113369), (113433, 114210)),
        ]

        reader = BigBedReader(self.bb_indexed)

        # a value absent from the index yields nothing
        no_hits = list(reader.search("name", "should_have_no_match"))
        self.assertEqual([], no_hits)

        # one transcript carries this Name
        by_name = list(reader.search("Name", "Sam-S-RE"))
        self.assertEqual([build_chain(*sam_s_re)], by_name)

        # every transcript of the gene is returned; order is not compared
        by_gene = list(reader.search("gene_id", "FBgn0005278"))
        wanted = [build_chain(*record) for record in gene_records]
        self.assertEqual(sorted(wanted), sorted(by_gene))
示例#2
0
 def test_get_autosql_str(self):
     """Check that `_get_autosql_str` returns the autoSql text on disk.

     For the readers registered under ``bb4as`` and ``bb12as`` in
     ``self.bb_bonuscols``, the string returned by `_get_autosql_str`
     must equal the contents of the matching
     ``bed<k>_bonus_bed_columns.as`` resource file.
     """
     for k in (4, 12):
         bbplus_as = BigBedReader(self.bb_bonuscols["bb%sas" % k])
         as_path = resource_filename(
             "plastid",
             "test/data/annotations/bed%s_bonus_bed_columns.as" % k)
         # Read through a context manager so the handle is closed
         # deterministically instead of waiting for garbage collection.
         with open(as_path) as as_file:
             expected_as = as_file.read()
         self.assertEqual(bbplus_as._get_autosql_str(), expected_as)
示例#3
0
 def test_search_fields_multivalue(self):
     """Search the indexed BigBed file for several values of one field.

     A query with two values that match nothing must return no records;
     a ``Name`` query with two known values must return the two expected
     chains, in the order listed here.
     """
     def build_chain(name, tx_id, alias, thickend, *exons):
         # Both expected records lie on '2L', '+' strand, and share every
         # attribute not passed to this helper.
         blocks = [GenomicSegment('2L', lo, hi, '+') for lo, hi in exons]
         return SegmentChain(*blocks,
                             Alias=alias,
                             ID=tx_id,
                             Name=name,
                             color='#000000',
                             gene_id='FBgn0005278',
                             score='0.0',
                             thickend=thickend,
                             thickstart='108685',
                             type='exon')

     reader = BigBedReader(self.bb_indexed)

     no_hits = list(reader.search("name","should_have_no_match","should_also_have_no_match"))
     self.assertEqual([],no_hits)

     hits = list(reader.search("Name","Sam-S-RE","Sam-S-RK"))
     wanted = [
         build_chain('Sam-S-RE', 'FBtr0089437',
                     "'['M(2)21AB-RE', 'CG2674-RE']'", '113542',
                     (106902, 107000), (107764, 107838), (108587, 108809),
                     (110405, 110483), (110754, 110877), (111906, 112019),
                     (112689, 113369), (113433, 114432)),
         build_chain('Sam-S-RK', 'FBtr0308091', 'na', '110900',
                     (107760, 107838), (108587, 108809), (110405, 110483),
                     (110754, 111337)),
     ]
     self.assertEqual(wanted,hits)
示例#4
0
    def __init__(self, *filenames, **kwargs):
        """Create a |BigBedGenomeHash|

        Parameters
        ----------
        *filenames : str
            One or more filenames to open (NOT open filehandles)

        return_type : class implementing a :py:meth:`from_bed` method
            Class of object to return (Default: |SegmentChain|)

        Raises
        ------
        ValueError
            If any item obtained from `filenames` is not a `str`
        """
        from plastid.readers.bigbed import BigBedReader

        record_class = kwargs.get("return_type", SegmentChain)
        paths = list(multiopen(filenames))

        # Reject anything that is not a plain filename before any reader
        # is constructed.
        for path in paths:
            if isinstance(path, str):
                continue
            raise ValueError("`filename` must be a 'str'. Found a '%s'." %
                             type(path))

        self.filenames = paths

        # one reader per file, all yielding the same record class
        readers = []
        for path in paths:
            readers.append(BigBedReader(path, return_type=record_class))
        self.bigbedreaders = readers
示例#5
0
 def test_return_type(self):
     """Check that readers yield objects of the requested `return_type`.

     The first five records of ``self.bbs[12]`` must be |Transcript|
     instances, and the first five records of a reader opened on the
     same file with ``return_type=SegmentChain`` must be |SegmentChain|
     instances.
     """
     bb = self.bbs[12]
     i = iter(bb)
     for _ in range(5):
         # assertIsInstance reports the offending object on failure,
         # unlike assertTrue(isinstance(...)) which only says "False".
         self.assertIsInstance(next(i), Transcript)

     ivcbb = BigBedReader(self.bbfiles[12], return_type=SegmentChain)
     i = iter(ivcbb)
     for _ in range(5):
         self.assertIsInstance(next(i), SegmentChain)
示例#6
0
    def setUpClass(cls):
        """Build the class-level fixtures used by the genome hash tests.

        Stores the BigBed reference file paths, one |BigBedGenomeHash| per
        file, the transcripts and coding regions read from those files,
        name-keyed lookups for both, and a shuffled list of transcript
        indices.
        """
        cls.binsize = 10000

        # reference files
        cls.tx_bbfile = REF_FILES["100transcripts_bigbed"]
        cls.cds_bbfile = REF_FILES["100cds_bigbed"]
        cls.as_cds_bbfile = REF_FILES["100cds_antisense_bigbed"]

        # hashes under test
        cls.tx_hash = BigBedGenomeHash(cls.tx_bbfile)
        cls.cds_hash = BigBedGenomeHash(cls.cds_bbfile)
        cls.as_cds_hash = BigBedGenomeHash(cls.as_cds_bbfile)

        # reference features, read straight from the same files
        tx_reader = BigBedReader(cls.tx_bbfile, return_type=Transcript)
        cls.transcripts = list(tx_reader)
        cds_reader = BigBedReader(cls.cds_bbfile)
        cls.coding_regions = list(cds_reader)
        cls.shuffled_indices = list(range(len(cls.transcripts)))

        # name -> feature lookups
        cls.tx_dict = dict((tx.get_name(), tx) for tx in cls.transcripts)
        cls.cds_dict = dict((cds.get_name(), cds) for cds in cls.coding_regions)

        shuffle(cls.shuffled_indices)
示例#7
0
 def test_custom_columns_names_without_autosql(self):
     """Check the placeholder names of extension fields.

     For the files registered under ``bb4no_as`` and ``bb12no_as`` in
     ``self.bb_bonuscols``, `extension_fields` must be an ordered mapping
     of ``custom_0`` .. ``custom_4``, each described as
     ``"no description"``.
     """
     placeholder_fields = OrderedDict(
         ("custom_%s" % idx, "no description") for idx in range(5))

     for ncols in (4, 12):
         reader = BigBedReader(self.bb_bonuscols["bb%sno_as" % ncols])
         self.assertEqual(reader.extension_fields, placeholder_fields)
示例#8
0
 def test_custom_columns_names_with_autosql(self):
     """Check the names and descriptions of extension fields.

     For the files registered under ``bb4as`` and ``bb12as`` in
     ``self.bb_bonuscols``, `extension_fields` must equal the ordered
     name -> description mapping listed below.
     """
     field_names = ("my_floats", "my_sets", "my_ints", "my_strs", "my_colors")
     descriptions = (
         "some float values",
         "some set options",
         "signed integer values",
         "str representation of transcripts",
         "r,g,b colors",
     )
     declared_fields = OrderedDict(zip(field_names, descriptions))

     for ncols in (4, 12):
         reader = BigBedReader(self.bb_bonuscols["bb%sas" % ncols])
         self.assertEqual(reader.extension_fields, declared_fields)
示例#9
0
 def test_search_fields_multivalue(self):
     """Search the indexed BigBed file for several values of one field.

     A query with two values that match nothing must return no records;
     a ``Name`` query with two known values must return the two expected
     chains, in the order listed here.
     """
     def build_chain(name, tx_id, alias, thickend, *exons):
         # Both expected records lie on '2L', '+' strand, and share every
         # attribute not passed to this helper.
         blocks = [GenomicSegment('2L', lo, hi, '+') for lo, hi in exons]
         return SegmentChain(*blocks,
                             Alias=alias,
                             ID=tx_id,
                             Name=name,
                             color='#000000',
                             gene_id='FBgn0005278',
                             score='0.0',
                             thickend=thickend,
                             thickstart='108685',
                             type='exon')

     reader = BigBedReader(self.bb_indexed)

     no_hits = list(
         reader.search("name", "should_have_no_match",
                       "should_also_have_no_match"))
     self.assertEqual([], no_hits)

     hits = list(reader.search("Name", "Sam-S-RE", "Sam-S-RK"))
     wanted = [
         build_chain('Sam-S-RE', 'FBtr0089437',
                     "'['M(2)21AB-RE', 'CG2674-RE']'", '113542',
                     (106902, 107000), (107764, 107838), (108587, 108809),
                     (110405, 110483), (110754, 110877), (111906, 112019),
                     (112689, 113369), (113433, 114432)),
         build_chain('Sam-S-RK', 'FBtr0308091', 'na', '110900',
                     (107760, 107838), (108587, 108809), (110405, 110483),
                     (110754, 111337)),
     ]
     self.assertEqual(wanted, hits)
示例#10
0
    def test_iter_same_as_bed_reader_flydata(self):
        """Compare BigBed and BED iteration on more complex transcript models.

        Records from ``self.flybedfile`` and ``self.flybbfile`` are paired
        in iteration order and each pair must satisfy
        `transcript_identical`. Afterwards, the index of the last pair
        compared must be 32682 - 1.
        """
        flybb = BigBedReader(self.flybbfile, return_type=Transcript)

        # Start below zero so that, if no pair is ever produced, the final
        # count assertion fails cleanly instead of raising NameError.
        n = -1

        # The BED handle is closed as soon as the comparison finishes.
        with open(self.flybedfile) as bed_fh:
            flybed = BED_Reader(bed_fh, return_type=Transcript)
            for n, (tx1, tx2) in enumerate(zip(flybed, flybb)):
                msg = "Transcript mismatch in BigBed file at record %s. Expected '%s'. Got '%s'." % (
                    n, tx1, tx2)
                self.assertTrue(transcript_identical(tx1, tx2), msg)

        # NOTE(review): zip() stops at the shorter input, so this catches
        # missing records but not extra records in either file.
        self.assertEqual(n, 32682 - 1)
示例#11
0
    def setUpClass(cls):
        """Build the class-level fixtures used by the tabix genome hash tests.

        Stores the tabix-indexed BED reference paths and one
        |TabixGenomeHash| per file, then loads reference transcripts and
        coding regions from the BigBed files, builds name-keyed lookups
        for both, and prepares a shuffled list of transcript indices.
        """
        # tabix-indexed BED files and the hashes built on them
        cls.tx_file = REF_FILES["100transcripts_bed_tabix"]
        cls.cds_file = REF_FILES["100cds_bed_tabix"]
        cls.as_cds_file = REF_FILES["100as_cds_bed_tabix"]

        cls.tx_hash = TabixGenomeHash(cls.tx_file, data_format="BED")
        cls.cds_hash = TabixGenomeHash(cls.cds_file, data_format="BED")
        cls.as_cds_hash = TabixGenomeHash(cls.as_cds_file, data_format="BED")

        # use BigBeds as reference
        # TODO: change to Transcript objects
        cls.tx_bbfile = REF_FILES["100transcripts_bigbed"]
        cls.cds_bbfile = REF_FILES["100cds_bigbed"]
        cls.as_cds_bbfile = REF_FILES["100cds_antisense_bigbed"]

        tx_reader = BigBedReader(cls.tx_bbfile, return_type=Transcript)
        cls.transcripts = list(tx_reader)
        cds_reader = BigBedReader(cls.cds_bbfile)
        cls.coding_regions = list(cds_reader)
        cls.shuffled_indices = list(range(len(cls.transcripts)))

        # name -> feature lookups
        cls.tx_dict = dict((tx.get_name(), tx) for tx in cls.transcripts)
        cls.cds_dict = dict((cds.get_name(), cds) for cds in cls.coding_regions)

        shuffle(cls.shuffled_indices)
示例#12
0
    def test_custom_columns_retval_type_without_autosql(self):
        """Check that extension columns come back as raw strings.

        The first five tab-separated fields of every line in
        ``self.bonus_col_file`` are collected, unconverted, under the keys
        ``custom_0`` .. ``custom_4``. For the files registered under
        ``bb4no_as`` and ``bb12no_as``, the n-th record's ``attr`` entries
        must equal the n-th collected values.
        """
        keys = ["custom_%s" % X for X in range(5)]
        values = {key: [] for key in keys}

        # The context manager closes the file even if a malformed line
        # raises while it is being parsed.
        with open(self.bonus_col_file) as bfile:
            for line in bfile:
                items = line.strip("\n").split("\t")
                for column, key in enumerate(keys):
                    values[key].append(items[column])

        for k in (4, 12):
            fn = "bb%sno_as" % k
            bb = BigBedReader(self.bb_bonuscols[fn])
            for n, item in enumerate(bb):
                for key in values:
                    self.assertEqual(values[key][n], item.attr[key])
示例#13
0
    def test_custom_columns_retval_type_with_autosql(self):
        """Check that extension columns are converted to their declared types.

        Each line of ``self.bonus_col_file`` is parsed into a float, a set
        of stripped strings (empty set for an empty field), an int, a str
        and a tuple of ints. For the files registered under ``bb4as`` and
        ``bb12as``, the n-th record's ``attr`` entries must match the n-th
        parsed values; floats are compared approximately.
        """
        values = {
            "my_floats": [],
            "my_sets": [],
            "my_ints": [],
            "my_strs": [],
            "my_colors": [],
        }

        # The context manager closes the file even if a malformed line
        # raises while it is being converted.
        with open(self.bonus_col_file) as bfile:
            for line in bfile:
                items = line.strip("\n").split("\t")
                values["my_floats"].append(float(items[0]))
                if items[1] == "":
                    values["my_sets"].append(set())
                else:
                    values["my_sets"].append(
                        {X.strip() for X in items[1].split(",")})
                values["my_ints"].append(int(items[2]))
                values["my_strs"].append(items[3])
                values["my_colors"].append(
                    tuple(int(X) for X in items[4].split(",")))

        for k in (4, 12):
            fn = "bb%sas" % k
            # A Warning can be caused by trying to turn the BED color field
            # to an int- this has to deal with the fact that BedToBigBed wants
            # field 9 (itemRgb, typically uint[3]) to be `reserved uint;` in
            # autoSql declarations.
            # NOTE(review): no "ignore" filter is installed inside this
            # context, so such warnings are not silenced here.
            with warnings.catch_warnings():
                bb = BigBedReader(self.bb_bonuscols[fn])
                for n, item in enumerate(bb):
                    for key in values:
                        expected = values[key][n]
                        found = item.attr[key]
                        msg = "failed test_custom_columns_retval_type_with_autosql at record %s, key %s. Expected '%s'. Got '%s' " % (
                            n, key, expected, found)
                        if isinstance(expected, float):
                            # Pass the message by keyword: the third
                            # positional slot of the almost-equal
                            # assertions is a precision, not a message.
                            self.assertAlmostEqual(expected, found, msg=msg)
                        else:
                            self.assertEqual(expected, found, msg)
示例#14
0
 def test_indexed_fields_as_no_index(self):
     """Check that `indexed_fields` is empty for the ``bb4as`` file."""
     bb_path = self.bb_bonuscols["bb4as"]
     unindexed = BigBedReader(bb_path)
     self.assertEqual([], unindexed.indexed_fields)
示例#15
0
 def test_get_no_autosql_str(self):
     """Check that `_get_autosql_str` returns an empty string.

     Applies to the files registered under ``bb4no_as`` and
     ``bb12no_as`` in ``self.bb_bonuscols``.
     """
     for ncols in (4, 12):
         bb_path = self.bb_bonuscols["bb%sno_as" % ncols]
         reader = BigBedReader(bb_path)
         autosql = reader._get_autosql_str()
         self.assertEqual(autosql, "")
示例#16
0
 def test_get_no_autosql_str(self):
     """Check that `_get_autosql_str` returns an empty string.

     Applies to the files registered under ``bb4no_as`` and
     ``bb12no_as`` in ``self.bb_bonuscols``.
     """
     for ncols in (4, 12):
         bb_path = self.bb_bonuscols["bb%sno_as" % ncols]
         reader = BigBedReader(bb_path)
         autosql = reader._get_autosql_str()
         self.assertEqual(autosql, "")
示例#17
0
 def test_get_autosql_str(self):
     """Check that `_get_autosql_str` returns the autoSql text on disk.

     For the readers registered under ``bb4as`` and ``bb12as`` in
     ``self.bb_bonuscols``, the string returned by `_get_autosql_str`
     must equal the contents of the matching
     ``bed<k>_bonus_bed_columns.as`` resource file.
     """
     for k in (4,12):
         bbplus_as = BigBedReader(self.bb_bonuscols["bb%sas" % k])
         as_path = resource_filename("plastid","test/data/annotations/bed%s_bonus_bed_columns.as" % k)
         # Read through a context manager so the handle is closed
         # deterministically instead of waiting for garbage collection.
         with open(as_path) as as_file:
             expected_as = as_file.read()
         self.assertEqual(bbplus_as._get_autosql_str(),expected_as)
示例#18
0
    def setUpClass(cls):
        """Build the class-level fixtures shared by the BigBed reader tests.

        Sets, on the class:

        - ``cols``, ``bedfiles``, ``bbfiles``: column counts and the
          BED / BigBed resource paths for each of them
        - ``chrom_sizes``: chromosome name -> int size, parsed from the
          tab-separated ``sacCer3.sizes`` resource
        - ``bbs``: one `BigBedReader` per column count, returning
          |Transcript| objects
        - ``tx_dict``, ``cds_dict``, ``as_cds_dict`` and the matching
          ``tx_hash``, ``cds_hash``, ``as_cds_hash`` |GenomeHash| objects
        - ``shuffled_indices``: shuffled indices into the transcript list
        - paths to the fly annotation files, to the BigBed files with
          extra columns, and to the indexed BigBed file
        """
        cls.cols = [3, 4, 5, 6, 8, 9, 12]
        cls.bedfiles = {}
        cls.bbfiles = {}
        for col in cls.cols:
            cls.bedfiles[col] = resource_filename(
                "plastid",
                "test/data/annotations/100transcripts_bed%s.bed" % col)
            cls.bbfiles[col] = resource_filename(
                "plastid",
                "test/data/annotations/100transcripts_bed%s.bb" % col)

        # Parse chromosome sizes; the context manager closes the file
        # instead of leaving the handle to the garbage collector.
        cls.chrom_sizes = {}
        sizes_file = resource_filename("plastid",
                                       "test/data/annotations/sacCer3.sizes")
        with open(sizes_file) as sizes_fh:
            for line in sizes_fh:
                chrom, size = line.strip().split("\t")
                cls.chrom_sizes[chrom] = int(size)

        cls.bbs = {
            K: BigBedReader(cls.bbfiles[K], return_type=Transcript)
            for K in cls.cols
        }

        # comparisons against genome hash
        cls.binsize = 10000
        # The reader is fully consumed inside the `with`, so the BED handle
        # can be closed right away.
        with open(cls.bedfiles[12]) as bed_fh:
            transcripts = list(BED_Reader(bed_fh, return_type=Transcript))

        cls.tx_dict = {}
        cls.cds_dict = {}
        cls.as_cds_dict = {}
        for tx in transcripts:
            txid = tx.get_name()
            cls.tx_dict[txid] = tx
            cds_ivc = tx.get_cds()
            cds_ivc.attr["ID"] = txid
            if cds_ivc.length > 0:
                # NOTE(review): the objects stored below come from fresh
                # get_cds() calls, not from `cds_ivc`; whether they carry
                # the ID assigned above depends on get_cds() — confirm.
                cls.cds_dict[txid] = tx.get_cds()
                cls.as_cds_dict[txid] = tx.get_cds().get_antisense()
                cls.as_cds_dict[txid].attr["ID"] = txid

        cls.tx_hash = GenomeHash(cls.tx_dict,
                                 do_copy=False,
                                 binsize=cls.binsize)
        cls.cds_hash = GenomeHash(cls.cds_dict,
                                  do_copy=False,
                                  binsize=cls.binsize)
        cls.as_cds_hash = GenomeHash(cls.as_cds_dict,
                                     do_copy=False,
                                     binsize=cls.binsize)

        cls.shuffled_indices = list(range(len(transcripts)))
        shuffle(cls.shuffled_indices)

        cls.flybbfile = resource_filename(
            "plastid", "test/data/annotations/dmel-all-no-analysis-r5.54.bb")
        cls.flybedfile = resource_filename(
            "plastid", "test/data/annotations/dmel-all-no-analysis-r5.54.bed")

        # BigBed files with and without extra columns, with and without autoSql descriptions
        cls.bb_bonuscols = {
            "bb4as":
            resource_filename(
                "plastid",
                "test/data/annotations/100transcripts_bed4plus_bonus_as.bb"),
            "bb12as":
            resource_filename(
                "plastid",
                "test/data/annotations/100transcripts_bed12plus_bonus_as.bb"),
            "bb4no_as":
            resource_filename(
                "plastid",
                "test/data/annotations/100transcripts_bed4plus_bonus_no_as.bb"
            ),
            "bb12no_as":
            resource_filename(
                "plastid",
                "test/data/annotations/100transcripts_bed12plus_bonus_no_as.bb"
            ),
        }
        cls.bonus_col_file = resource_filename(
            "plastid", "test/data/annotations/bonus_bed_columns.txt")

        # BigBed file with indexes
        cls.bb_indexed = resource_filename(
            "plastid", "test/data/annotations/dmel-bonus-cols.bb")
示例#19
0
    def test_search_fields_singlevalue(self):
        """Search the indexed BigBed file for a single value per field.

        Three queries are run against ``self.bb_indexed``: a value that
        should match nothing, a ``Name`` lookup expected to return exactly
        one chain, and a ``gene_id`` lookup expected to return twelve
        chains. The last comparison sorts both sides, so the order in
        which hits are returned is not checked.
        """
        def build_chain(name, tx_id, alias, thickstart, thickend, *exons):
            # Every expected record lies on '2L', '+' strand, and shares the
            # color / gene_id / score / type attributes; only the arguments
            # of this helper vary from record to record.
            blocks = [GenomicSegment('2L', lo, hi, '+') for lo, hi in exons]
            return SegmentChain(*blocks,
                                Alias=alias,
                                ID=tx_id,
                                Name=name,
                                color='#000000',
                                gene_id='FBgn0005278',
                                score='0.0',
                                thickend=thickend,
                                thickstart=thickstart,
                                type='exon')

        # (name, ID, Alias, thickstart, thickend, exon (start, end) pairs...)
        sam_s_re = ('Sam-S-RE', 'FBtr0089437', "'['M(2)21AB-RE', 'CG2674-RE']'",
                    '108685', '113542',
                    (106902, 107000), (107764, 107838), (108587, 108809),
                    (110405, 110483), (110754, 110877), (111906, 112019),
                    (112689, 113369), (113433, 114432))
        gene_records = [
            sam_s_re,
            ('Sam-S-RK', 'FBtr0308091', 'na', '108685', '110900',
             (107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 111337)),
            ('Sam-S-RB', 'FBtr0089428', "'['M(2)21AB-RB', 'CG2674-RB']'",
             '108685', '112741',
             (107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (111906, 112019),
             (112689, 113369), (113433, 114210)),
            ('Sam-S-RA', 'FBtr0089429', "'['M(2)21AB-RA', 'CG2674-RA']'",
             '108685', '113542',
             (107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RL', 'FBtr0330656', 'na', '108685', '112781',
             (107760, 107956), (108587, 108809), (110405, 110483),
             (110754, 110877), (112689, 113369), (113433, 114432)),
            ('Sam-S-RH', 'FBtr0089432', "'['M(2)21AB-RH', 'CG2674-RH']'",
             '108685', '113542',
             (107936, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114210)),
            ('Sam-S-RD', 'FBtr0089430', "'['M(2)21AB-RD', 'CG2674-RD']'",
             '108685', '113542',
             (107936, 108101), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RC', 'FBtr0089431', "'['M(2)21AB-RC', 'CG2674-RC']'",
             '108685', '113542',
             (107936, 108101), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RF', 'FBtr0089433', "'['M(2)21AB-RF', 'CG2674-RF']'",
             '108685', '113542',
             (108088, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RI', 'FBtr0089434', "'['M(2)21AB-RI', 'CG2674-RI']'",
             '108685', '113542',
             (108132, 108346), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RJ', 'FBtr0089435', "'['M(2)21AB-RJ', 'CG2674-RJ']'",
             '108685', '113542',
             (108132, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (112689, 113369),
             (113433, 114432)),
            ('Sam-S-RG', 'FBtr0089436', "'['M(2)21AB-RG', 'CG2674-RG']'",
             '109750', '113542',
             (109593, 109793), (110405, 110483), (110754, 110877),
             (111004, 111117), (112689, 113369), (113433, 114210)),
        ]

        reader = BigBedReader(self.bb_indexed)

        # a value absent from the index yields nothing
        no_hits = list(reader.search("name", "should_have_no_match"))
        self.assertEqual([], no_hits)

        # one transcript carries this Name
        by_name = list(reader.search("Name", "Sam-S-RE"))
        self.assertEqual([build_chain(*sam_s_re)], by_name)

        # every transcript of the gene is returned; order is not compared
        by_gene = list(reader.search("gene_id", "FBgn0005278"))
        wanted = [build_chain(*record) for record in gene_records]
        self.assertEqual(sorted(wanted), sorted(by_gene))
示例#20
0
 def test_indexed_fields(self):
     """Check the set of indexed fields reported for ``self.bb_indexed``.

     Both sides are sorted before comparison, so the order in which the
     reader reports its indexed fields is not checked.
     """
     indexed_reader = BigBedReader(self.bb_indexed)
     wanted_fields = sorted(["gene_id", "name", "Name", "Alias"])
     reported_fields = sorted(indexed_reader.indexed_fields)
     self.assertEqual(wanted_fields, reported_fields)
示例#21
0
 def test_search_fields_invalid_raises_error(self):
     """Check that calling `search` with an unknown field raises `KeyError`."""
     indexed_reader = BigBedReader(self.bb_indexed)
     with self.assertRaises(KeyError):
         indexed_reader.search("garbage_field", "garbage_value")
示例#22
0
文件: cs.py 项目: zzygyx9119/plastid
def _merge_transcript_group(transcripts, mask_hash, chrom, gene_table,
                            transcript_table, merged_genes):
    """Process one batch of transcripts and fold the results into the running tables.

    Parameters
    ----------
    transcripts : dict
        Transcripts collected for this batch, keyed by ``get_name()``

    mask_hash
        Mask object passed through unchanged to ``process_partial_group``

    chrom
        Chromosome/contig name, used only in the progress message

    gene_table : :py:class:`pandas.DataFrame`
        Gene table accumulated so far

    transcript_table : :py:class:`pandas.DataFrame`
        Transcript table accumulated so far

    merged_genes : dict
        Updated in place with the mapping returned by
        ``process_partial_group``

    Returns
    -------
    tuple
        ``(gene_table, transcript_table)`` with the rows of this batch
        appended
    """
    printer.write("Merging genes on chromosome/contig '%s'" % chrom)
    my_gene_table, my_transcript_table, my_merged_genes = process_partial_group(
        transcripts, mask_hash, printer)
    gene_table = pd.concat((gene_table, my_gene_table), axis=0)
    transcript_table = pd.concat((transcript_table, my_transcript_table),
                                 axis=0)
    merged_genes.update(my_merged_genes)
    return gene_table, transcript_table


def do_generate(args, annotation_parser, mask_parser):
    """Generate gene position files from gene annotations.

     1. Genes whose transcripts share exons are first collapsed into merged
        genes.

     2. Within merged genes, all positions are classified. All positions are
        included in a set called *exon*. All positions that appear as coding
        regions in all transcripts (i.e. are never part of a 5'UTR or 3'UTR)
        are included in a set called *CDS*. Similarly, all positions that
        appear as 5' UTR or 3' UTR in all transcripts are included in sets
        called *UTR5* or *UTR3*, respectively.

     3. Genomic positions that are overlapped by multiple merged genes are
        excluded from the position sets for those genes.

     4. If a :term:`mask file` is supplied, positions annotated in the mask file
        are also excluded.

     5. Output is given as a series of `BED`_ files and a `positions` file
        containing the same data.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``generate`` subprogram

    annotation_parser
        object whose ``get_transcripts_from_args(args, printer=...)`` method
        supplies the iterator of transcripts to process

    mask_parser
        object whose ``get_genome_hash_from_args(args)`` method supplies the
        mask handed to ``process_partial_group``
    """
    # gene -> merged gene name, filled in batch by batch
    merged_genes = {}

    # data table for merged genes
    gene_table = pd.DataFrame({
        "region": [],
        "transcript_ids": [],
        "exon_unmasked": [],
        "exon": [],
        "masked": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    })

    # data table for transcripts
    transcript_table = pd.DataFrame({
        "region": [],
        "exon": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked": [],
        "exon_unmasked": [],
        "transcript_ids": [],
        "masked_bed": [],
    })

    # NOTE(review): the explicit `== True` comparisons are retained from the
    # original; presumably these are argparse store_true flags -- confirm
    # before simplifying to plain truthiness.
    is_sorted = (args.sorted == True) or \
                (args.tabix == True) or \
                (args.annotation_format == "BigBed")

    annotation_message = """`cs` relies upon relationships between
    transcripts and genes to collapse transcripts to genes for quantitation.
    Gene-transcript relationships are not generally preserved in BED or BigBed
    files, and a `gene_id` column could not be found in the input data. This
    may yield nonsensical results in the output.

    Consider either (1) using a GTF2 or GFF3 file or (2) creating an extended
    BED or BigBed file with a `gene_id` column.""".replace("    ", "").replace(
        "\n", " ")

    # warn when the input format gives no `gene_id` column to group by
    if args.annotation_format == "BED":
        if not isinstance(args.bed_extra_columns,
                          list) or 'gene_id' not in args.bed_extra_columns:
            warnings.warn(annotation_message, FileFormatWarning)
    elif args.annotation_format == "BigBed":
        reader = BigBedReader(args.annotation_files[0])
        if 'gene_id' not in reader.extension_fields:
            warnings.warn(annotation_message, FileFormatWarning)

    source = annotation_parser.get_transcripts_from_args(args, printer=printer)
    mask_hash = mask_parser.get_genome_hash_from_args(args)

    # To save memory, sorted input is processed one chromosome at a time:
    # once the chromosome changes, every transcript of the previous one has
    # been seen. Unsorted input is collected whole and processed once at EOF.
    transcripts = {}
    last_chrom = None

    for tx in source:
        if is_sorted:
            chrom = tx.spanning_segment.chrom
            if chrom != last_chrom:
                # nothing has been collected yet before the first chromosome
                if last_chrom is not None:
                    gene_table, transcript_table = _merge_transcript_group(
                        transcripts, mask_hash, last_chrom, gene_table,
                        transcript_table, merged_genes)

                # drop the finished batch before starting the next one
                transcripts = {}
                gc.collect()
                del gc.garbage[:]
                last_chrom = chrom

        transcripts[tx.get_name()] = tx

    # Final batch at EOF. Unsorted input is always processed, even when no
    # transcripts were found (`last_chrom` is then still None); sorted input
    # is processed only if at least one transcript was read.
    if transcripts or not is_sorted:
        gene_table, transcript_table = _merge_transcript_group(
            transcripts, mask_hash, last_chrom, gene_table, transcript_table,
            merged_genes)
        del transcripts
        gc.collect()
        del gc.garbage[:]

    # write output
    printer.write("Writing output ...")

    merged_fn = "%s_merged.txt" % args.outbase
    number_merged = len(set(merged_genes.values()))
    printer.write("Collapsed %s genes to %s merged groups. Writing to %s" %
                  (len(merged_genes), number_merged, merged_fn))
    fout = argsopener(merged_fn, args, "w")
    try:
        for gene, merged_name in sorted(merged_genes.items()):
            fout.write("%s\t%s\n" % (gene, merged_name))
    finally:
        # close the handle even if a write fails
        fout.close()

    printer.write("Writing gene table and BED files ...")
    write_output_files(gene_table, "gene", args)
    printer.write("Writing transcript summary table and BED files ...")
    write_output_files(transcript_table, "transcript", args)
    printer.write("Done!")