def test_search_fields_singlevalue(self):
    """Search the indexed BigBed file with a single value per field.

    A value absent from the index must yield nothing, a `Name` lookup must
    yield exactly the matching isoform, and a `gene_id` lookup must yield
    every isoform listed below (compared without regard to order).
    """
    def isoform(exons, alias, tx_id, name, thickstart, thickend):
        # Every expected record lies on the plus strand of 2L and shares the
        # same color, gene_id, score and type attributes.
        pieces = [GenomicSegment('2L', left, right, '+') for (left, right) in exons]
        return SegmentChain(*pieces,
                            Alias=alias,
                            ID=tx_id,
                            Name=name,
                            color='#000000',
                            gene_id='FBgn0005278',
                            score='0.0',
                            thickend=thickend,
                            thickstart=thickstart,
                            type='exon')

    # (exons, Alias, ID, Name, thickstart, thickend)
    sam_s_re = (
        [(106902, 107000), (107764, 107838), (108587, 108809), (110405, 110483),
         (110754, 110877), (111906, 112019), (112689, 113369), (113433, 114432)],
        "'['M(2)21AB-RE', 'CG2674-RE']'", 'FBtr0089437', 'Sam-S-RE', '108685', '113542',
    )
    gene_isoforms = [
        sam_s_re,
        (
            [(107760, 107838), (108587, 108809), (110405, 110483), (110754, 111337)],
            'na', 'FBtr0308091', 'Sam-S-RK', '108685', '110900',
        ),
        (
            [(107760, 107838), (108587, 108809), (110405, 110483), (110754, 110877),
             (111004, 111117), (111906, 112019), (112689, 113369), (113433, 114210)],
            "'['M(2)21AB-RB', 'CG2674-RB']'", 'FBtr0089428', 'Sam-S-RB', '108685', '112741',
        ),
        (
            [(107760, 107838), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            "'['M(2)21AB-RA', 'CG2674-RA']'", 'FBtr0089429', 'Sam-S-RA', '108685', '113542',
        ),
        (
            [(107760, 107956), (108587, 108809), (110405, 110483), (110754, 110877),
             (112689, 113369), (113433, 114432)],
            'na', 'FBtr0330656', 'Sam-S-RL', '108685', '112781',
        ),
        (
            [(107936, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114210)],
            "'['M(2)21AB-RH', 'CG2674-RH']'", 'FBtr0089432', 'Sam-S-RH', '108685', '113542',
        ),
        (
            [(107936, 108101), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            "'['M(2)21AB-RD', 'CG2674-RD']'", 'FBtr0089430', 'Sam-S-RD', '108685', '113542',
        ),
        (
            [(107936, 108101), (108587, 108809), (110405, 110483), (110754, 110877),
             (111004, 111117), (112689, 113369), (113433, 114432)],
            "'['M(2)21AB-RC', 'CG2674-RC']'", 'FBtr0089431', 'Sam-S-RC', '108685', '113542',
        ),
        (
            [(108088, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            "'['M(2)21AB-RF', 'CG2674-RF']'", 'FBtr0089433', 'Sam-S-RF', '108685', '113542',
        ),
        (
            [(108132, 108346), (108587, 108809), (110405, 110483), (110754, 110877),
             (111906, 112019), (112689, 113369), (113433, 114432)],
            "'['M(2)21AB-RI', 'CG2674-RI']'", 'FBtr0089434', 'Sam-S-RI', '108685', '113542',
        ),
        (
            [(108132, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
             (111004, 111117), (112689, 113369), (113433, 114432)],
            "'['M(2)21AB-RJ', 'CG2674-RJ']'", 'FBtr0089435', 'Sam-S-RJ', '108685', '113542',
        ),
        (
            [(109593, 109793), (110405, 110483), (110754, 110877), (111004, 111117),
             (112689, 113369), (113433, 114210)],
            "'['M(2)21AB-RG', 'CG2674-RG']'", 'FBtr0089436', 'Sam-S-RG', '109750', '113542',
        ),
    ]

    reader = BigBedReader(self.bb_indexed)

    # a value that is not in the index yields no records
    found = list(reader.search("name", "should_have_no_match"))
    self.assertEqual([], found)

    # a transcript name identifies exactly one record
    found = list(reader.search("Name", "Sam-S-RE"))
    expected = [isoform(*sam_s_re)]
    self.assertEqual(expected, found)

    # a gene ID identifies all of its isoforms; order is not checked
    found = list(reader.search("gene_id", "FBgn0005278"))
    expected = [isoform(*spec) for spec in gene_isoforms]
    self.assertEqual(sorted(expected), sorted(found))
def test_get_autosql_str(self):
    """Check that the autoSql text read from a BigBed file matches the `.as` file on disk.

    Runs once for the BED4+ and once for the BED12+ file that carry an
    autoSql declaration.
    """
    for k in (4, 12):
        bbplus_as = BigBedReader(self.bb_bonuscols["bb%sas" % k])
        as_path = resource_filename(
            "plastid", "test/data/annotations/bed%s_bonus_bed_columns.as" % k)
        # read through a context manager so the handle is closed even if
        # the read fails, instead of being left to the garbage collector
        with open(as_path) as as_file:
            expected_as = as_file.read()
        self.assertEqual(bbplus_as._get_autosql_str(), expected_as)
def test_search_fields_multivalue(self):
    """Search the indexed BigBed file with several values for one field.

    Values absent from the index must yield nothing; two transcript names
    must yield the two matching records in the order checked below.
    """
    reader = BigBedReader(self.bb_indexed)

    no_hits = list(reader.search("name", "should_have_no_match", "should_also_have_no_match"))
    self.assertEqual([], no_hits)

    hits = list(reader.search("Name", "Sam-S-RE", "Sam-S-RK"))
    sam_s_re = SegmentChain(
        GenomicSegment('2L', 106902, 107000, '+'),
        GenomicSegment('2L', 107764, 107838, '+'),
        GenomicSegment('2L', 108587, 108809, '+'),
        GenomicSegment('2L', 110405, 110483, '+'),
        GenomicSegment('2L', 110754, 110877, '+'),
        GenomicSegment('2L', 111906, 112019, '+'),
        GenomicSegment('2L', 112689, 113369, '+'),
        GenomicSegment('2L', 113433, 114432, '+'),
        Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
        ID='FBtr0089437',
        Name='Sam-S-RE',
        color='#000000',
        gene_id='FBgn0005278',
        score='0.0',
        thickend='113542',
        thickstart='108685',
        type='exon',
    )
    sam_s_rk = SegmentChain(
        GenomicSegment('2L', 107760, 107838, '+'),
        GenomicSegment('2L', 108587, 108809, '+'),
        GenomicSegment('2L', 110405, 110483, '+'),
        GenomicSegment('2L', 110754, 111337, '+'),
        Alias='na',
        ID='FBtr0308091',
        Name='Sam-S-RK',
        color='#000000',
        gene_id='FBgn0005278',
        score='0.0',
        thickend='110900',
        thickstart='108685',
        type='exon',
    )
    self.assertEqual([sam_s_re, sam_s_rk], hits)
def __init__(self, *filenames, **kwargs):
    """Create a |BigBedGenomeHash|

    Parameters
    ----------
    *filenames : str
        One or more filenames to open (NOT open filehandles)

    return_type : class implementing a :py:meth:`from_bed` method
        Class of object to return (Default: |SegmentChain|)

    Raises
    ------
    ValueError
        If any entry produced by `multiopen` is not a `str`
    """
    # imported here rather than at module level, as in the original code
    from plastid.readers.bigbed import BigBedReader

    chain_class = kwargs.get("return_type", SegmentChain)
    paths = list(multiopen(filenames))

    # reject anything that is not a plain filename before opening any reader
    not_strings = [X for X in paths if not isinstance(X, str)]
    if not_strings:
        raise ValueError("`filename` must be a 'str'. Found a '%s'." % type(not_strings[0]))

    self.filenames = paths
    readers = []
    for path in paths:
        readers.append(BigBedReader(path, return_type=chain_class))
    self.bigbedreaders = readers
def test_return_type(self):
    """Check that a reader yields objects of the `return_type` it was opened with.

    The first five records of the BED12 reader from `self.bbs` must be
    |Transcript| objects, and the first five records of a reader opened on
    the same file with ``return_type=SegmentChain`` must be |SegmentChain|
    objects.
    """
    bb = self.bbs[12]
    i = iter(bb)
    for _ in range(5):
        # assertIsInstance reports the offending object and expected class
        # on failure, where assertTrue(isinstance(...)) only says "False"
        self.assertIsInstance(next(i), Transcript)

    ivcbb = BigBedReader(self.bbfiles[12], return_type=SegmentChain)
    i = iter(ivcbb)
    for _ in range(5):
        self.assertIsInstance(next(i), SegmentChain)
def setUpClass(cls):
    """Set up test data for `TestGenomeHash`"""
    cls.binsize = 10000

    # reference BigBed files
    tx_path = REF_FILES["100transcripts_bigbed"]
    cds_path = REF_FILES["100cds_bigbed"]
    antisense_cds_path = REF_FILES["100cds_antisense_bigbed"]
    cls.tx_bbfile = tx_path
    cls.cds_bbfile = cds_path
    cls.as_cds_bbfile = antisense_cds_path

    # hashes under test, one per reference file
    cls.tx_hash = BigBedGenomeHash(tx_path)
    cls.cds_hash = BigBedGenomeHash(cds_path)
    cls.as_cds_hash = BigBedGenomeHash(antisense_cds_path)

    # the same records read directly, plus name -> record lookups
    transcripts = list(BigBedReader(tx_path, return_type=Transcript))
    coding_regions = list(BigBedReader(cds_path))
    cls.transcripts = transcripts
    cls.coding_regions = coding_regions
    cls.shuffled_indices = list(range(len(transcripts)))
    cls.tx_dict = dict((X.get_name(), X) for X in transcripts)
    cls.cds_dict = dict((X.get_name(), X) for X in coding_regions)

    # randomize the order in which indices are stored
    shuffle(cls.shuffled_indices)
def test_custom_columns_names_without_autosql(self):
    """Check the names and descriptions reported for extra columns when the file has no autoSql.

    Both the BED4+ and the BED12+ file must report five fields named
    ``custom_0`` .. ``custom_4``, each described as "no description".
    """
    expected = OrderedDict()
    for idx in range(5):
        expected["custom_%s" % idx] = "no description"

    for fn in ("bb4no_as", "bb12no_as"):
        reader = BigBedReader(self.bb_bonuscols[fn])
        self.assertEqual(reader.extension_fields, expected)
def test_custom_columns_names_with_autosql(self):
    """Check the names and descriptions reported for extra columns when the file has autoSql.

    Both the BED4+ and the BED12+ file must report the five fields below,
    in this order, with these descriptions.
    """
    expected = OrderedDict()
    expected["my_floats"] = "some float values"
    expected["my_sets"] = "some set options"
    expected["my_ints"] = "signed integer values"
    expected["my_strs"] = "str representation of transcripts"
    expected["my_colors"] = "r,g,b colors"

    for fn in ("bb4as", "bb12as"):
        reader = BigBedReader(self.bb_bonuscols[fn])
        self.assertEqual(reader.extension_fields, expected)
def test_search_fields_multivalue(self):
    """Search the indexed BigBed file with several values for one field.

    Values absent from the index must yield nothing; two transcript names
    must yield the two matching records in the order checked below.
    """
    # attributes common to both expected records
    shared = dict(color='#000000', gene_id='FBgn0005278', score='0.0', type='exon')

    def plus_strand_2L(exons):
        return [GenomicSegment('2L', start, end, '+') for start, end in exons]

    reader = BigBedReader(self.bb_indexed)

    found = list(reader.search("name", "should_have_no_match", "should_also_have_no_match"))
    self.assertEqual([], found)

    found = list(reader.search("Name", "Sam-S-RE", "Sam-S-RK"))
    re_exons = [
        (106902, 107000), (107764, 107838), (108587, 108809), (110405, 110483),
        (110754, 110877), (111906, 112019), (112689, 113369), (113433, 114432),
    ]
    rk_exons = [
        (107760, 107838), (108587, 108809), (110405, 110483), (110754, 111337),
    ]
    expected = [
        SegmentChain(*plus_strand_2L(re_exons),
                     Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
                     ID='FBtr0089437',
                     Name='Sam-S-RE',
                     thickend='113542',
                     thickstart='108685',
                     **shared),
        SegmentChain(*plus_strand_2L(rk_exons),
                     Alias='na',
                     ID='FBtr0308091',
                     Name='Sam-S-RK',
                     thickend='110900',
                     thickstart='108685',
                     **shared),
    ]
    self.assertEqual(expected, found)
def test_iter_same_as_bed_reader_flydata(self):
    """Check that iterating the fly BigBed file yields the same transcripts as its BED source.

    Records from the two readers are compared pairwise, in file order, with
    `transcript_identical`. The final assertion pins the index of the last
    compared pair, so a short read from either file is detected.
    """
    # test more complex transcript models
    flybb = BigBedReader(self.flybbfile, return_type=Transcript)

    # the BED handle is opened in a context manager so it is closed even
    # when a comparison fails part-way through the file
    with open(self.flybedfile) as bedfile:
        flybed = BED_Reader(bedfile, return_type=Transcript)
        for n, (tx1, tx2) in enumerate(zip(flybed, flybb)):
            msg = "Transcript mismatch in BigBed file at record %s. Expected '%s'. Got '%s'." % (
                n, tx1, tx2)
            self.assertTrue(transcript_identical(tx1, tx2), msg)

    self.assertEqual(n, 32682 - 1)
def setUpClass(cls):
    """Set up test data for `TestTabixGenomeHash`"""
    # tabix-indexed BED files and the hashes under test
    tx_tabix = REF_FILES["100transcripts_bed_tabix"]
    cds_tabix = REF_FILES["100cds_bed_tabix"]
    antisense_cds_tabix = REF_FILES["100as_cds_bed_tabix"]
    cls.tx_file = tx_tabix
    cls.cds_file = cds_tabix
    cls.as_cds_file = antisense_cds_tabix
    cls.tx_hash = TabixGenomeHash(tx_tabix, data_format="BED")
    cls.cds_hash = TabixGenomeHash(cds_tabix, data_format="BED")
    cls.as_cds_hash = TabixGenomeHash(antisense_cds_tabix, data_format="BED")

    # use BigBeds as reference
    # TODO: change to Transcript objects
    tx_bigbed = REF_FILES["100transcripts_bigbed"]
    cds_bigbed = REF_FILES["100cds_bigbed"]
    cls.tx_bbfile = tx_bigbed
    cls.cds_bbfile = cds_bigbed
    cls.as_cds_bbfile = REF_FILES["100cds_antisense_bigbed"]

    transcripts = list(BigBedReader(tx_bigbed, return_type=Transcript))
    coding_regions = list(BigBedReader(cds_bigbed))
    cls.transcripts = transcripts
    cls.coding_regions = coding_regions
    cls.shuffled_indices = list(range(len(transcripts)))
    cls.tx_dict = dict((X.get_name(), X) for X in transcripts)
    cls.cds_dict = dict((X.get_name(), X) for X in coding_regions)

    # randomize the order in which indices are stored
    shuffle(cls.shuffled_indices)
def test_custom_columns_retval_type_without_autosql(self):
    """Check that extra columns come back as raw strings when the file has no autoSql.

    The expected values are the first five tab-separated items of each line
    of `self.bonus_col_file`, compared record by record against
    ``item.attr["custom_0"]`` .. ``item.attr["custom_4"]``.
    """
    columns = ["custom_%s" % X for X in range(5)]
    values = {name: [] for name in columns}

    # context manager closes the file even if a malformed line raises
    with open(self.bonus_col_file) as bfile:
        for line in bfile:
            items = line.strip("\n").split("\t")
            for idx, name in enumerate(columns):
                values[name].append(items[idx])

    for k in (4, 12):
        fn = "bb%sno_as" % k
        bb = BigBedReader(self.bb_bonuscols[fn])
        for n, item in enumerate(bb):
            for key in values:
                self.assertEqual(values[key][n], item.attr[key])
def test_custom_columns_retval_type_with_autosql(self):
    """Check that extra columns are converted to the types declared in autoSql.

    Expected values are parsed from `self.bonus_col_file`: column 0 as
    float, column 1 as a set of comma-separated strings (empty set when the
    column is empty), column 2 as int, column 3 as str, and column 4 as a
    tuple of ints. Each record's ``attr`` entries are compared against
    these, floats approximately and everything else exactly.
    """
    values = {
        "my_floats": [],
        "my_sets": [],
        "my_ints": [],
        "my_strs": [],
        "my_colors": [],
    }

    # context manager closes the file even if a conversion below raises
    with open(self.bonus_col_file) as bfile:
        for line in bfile:
            items = line.strip("\n").split("\t")
            values["my_floats"].append(float(items[0]))
            if items[1] == "":
                values["my_sets"].append(set())
            else:
                values["my_sets"].append({X.strip() for X in items[1].split(",")})
            values["my_ints"].append(int(items[2]))
            values["my_strs"].append(items[3])
            values["my_colors"].append(tuple(int(X) for X in items[4].split(",")))

    for k in (4, 12):
        fn = "bb%sas" % k

        # A Warning can be caused by trying to turn the BED color field
        # to an int- this has to deal with the fact that BedToBigBed wants
        # field 9 (itemRgb, typically uint[3]) to be `reserved uint;` in
        # autoSql declarations.
        # NOTE: the filter that would silence it is disabled, so this
        # context only restores the warning state on exit.
        with warnings.catch_warnings():
            #warnings.simplefilter("ignore")
            bb = BigBedReader(self.bb_bonuscols[fn])
            for n, item in enumerate(bb):
                for key in values:
                    expected = values[key][n]
                    found = item.attr[key]
                    msg = "failed test_custom_columns_retval_type_with_autosql at record %s, key %s. Expected '%s'. Got '%s' " % (
                        n, key, expected, found)
                    if isinstance(expected, float):
                        # The message must go by keyword: the third
                        # positional slot of the almost-equal assertion is
                        # the precision, not the failure message.
                        self.assertAlmostEqual(expected, found, msg=msg)
                    else:
                        self.assertEqual(expected, found, msg)
def test_indexed_fields_as_no_index(self):
    """Check that the "bb4as" BigBed file reports an empty list of indexed fields."""
    bb_path = self.bb_bonuscols["bb4as"]
    indexed = BigBedReader(bb_path).indexed_fields
    self.assertEqual([], indexed)
def test_get_no_autosql_str(self):
    """Check that files without an autoSql declaration report an empty autoSql string."""
    for fn in ("bb4no_as", "bb12no_as"):
        autosql = BigBedReader(self.bb_bonuscols[fn])._get_autosql_str()
        self.assertEqual(autosql, "")
def test_get_no_autosql_str(self):
    """Check that BigBed files lacking autoSql yield ``""`` from `_get_autosql_str`."""
    keys = ["bb%sno_as" % ncols for ncols in (4, 12)]
    for key in keys:
        reader = BigBedReader(self.bb_bonuscols[key])
        found = reader._get_autosql_str()
        self.assertEqual(found, "")
def test_get_autosql_str(self):
    """Check that the autoSql text stored in a BigBed file equals its source `.as` file.

    Exercised for both the BED4+ and the BED12+ file that were built with an
    autoSql declaration.
    """
    for k in (4, 12):
        bbplus_as = BigBedReader(self.bb_bonuscols["bb%sas" % k])
        autosql_path = resource_filename(
            "plastid", "test/data/annotations/bed%s_bonus_bed_columns.as" % k)
        # close the handle deterministically rather than leaking it via a
        # bare open(...).read()
        with open(autosql_path) as autosql_file:
            expected_as = autosql_file.read()
        self.assertEqual(bbplus_as._get_autosql_str(), expected_as)
def setUpClass(cls):
    """Set up shared fixtures for the BigBed reader tests.

    Populates, on the class: paths to BED/BigBed files for each tested
    column count, chromosome sizes, one |Transcript|-returning reader per
    BigBed file, reference dictionaries and `GenomeHash` objects built from
    the BED12 file, a shuffled list of transcript indices, and paths to the
    fly annotation, bonus-column and indexed BigBed files.
    """
    cls.cols = [3, 4, 5, 6, 8, 9, 12]
    cls.bedfiles = {}
    cls.bbfiles = {}
    for col in cls.cols:
        cls.bedfiles[col] = resource_filename(
            "plastid", "test/data/annotations/100transcripts_bed%s.bed" % col)
        cls.bbfiles[col] = resource_filename(
            "plastid", "test/data/annotations/100transcripts_bed%s.bb" % col)

    cls.chrom_sizes = {}
    # context manager so the sizes file is closed once it has been parsed
    with open(resource_filename("plastid",
                                "test/data/annotations/sacCer3.sizes")) as sizes_file:
        for line in sizes_file:
            chrom, size = line.strip().split("\t")
            cls.chrom_sizes[chrom] = int(size)

    cls.bbs = {K: BigBedReader(cls.bbfiles[K], return_type=Transcript) for K in cls.cols}

    # comparisons against genome hash
    cls.binsize = 10000
    # the reader is consumed fully inside the `with`, so the handle can be
    # closed as soon as the list is built
    with open(cls.bedfiles[12]) as bed12_file:
        transcripts = list(BED_Reader(bed12_file, return_type=Transcript))

    cls.tx_dict = {}
    cls.cds_dict = {}
    cls.as_cds_dict = {}
    for tx in transcripts:
        txid = tx.get_name()
        cls.tx_dict[txid] = tx
        cds_ivc = tx.get_cds()
        cds_ivc.attr["ID"] = txid
        if cds_ivc.length > 0:
            # NOTE(review): the stored CDS comes from a second get_cds()
            # call, not from `cds_ivc` whose ID was just set -- confirm
            # whether that is intentional
            cls.cds_dict[txid] = tx.get_cds()
            cls.as_cds_dict[txid] = tx.get_cds().get_antisense()
            cls.as_cds_dict[txid].attr["ID"] = txid

    cls.tx_hash = GenomeHash(cls.tx_dict, do_copy=False, binsize=cls.binsize)
    cls.cds_hash = GenomeHash(cls.cds_dict, do_copy=False, binsize=cls.binsize)
    cls.as_cds_hash = GenomeHash(cls.as_cds_dict, do_copy=False, binsize=cls.binsize)

    cls.shuffled_indices = list(range(len(transcripts)))
    shuffle(cls.shuffled_indices)

    cls.flybbfile = resource_filename(
        "plastid", "test/data/annotations/dmel-all-no-analysis-r5.54.bb")
    cls.flybedfile = resource_filename(
        "plastid", "test/data/annotations/dmel-all-no-analysis-r5.54.bed")

    # BigBed files with and without extra columns, with and without autoSql descriptions
    cls.bb_bonuscols = {
        "bb4as": resource_filename(
            "plastid", "test/data/annotations/100transcripts_bed4plus_bonus_as.bb"),
        "bb12as": resource_filename(
            "plastid", "test/data/annotations/100transcripts_bed12plus_bonus_as.bb"),
        "bb4no_as": resource_filename(
            "plastid", "test/data/annotations/100transcripts_bed4plus_bonus_no_as.bb"),
        "bb12no_as": resource_filename(
            "plastid", "test/data/annotations/100transcripts_bed12plus_bonus_no_as.bb"),
    }
    cls.bonus_col_file = resource_filename(
        "plastid", "test/data/annotations/bonus_bed_columns.txt")

    # BigBed file with indexes
    cls.bb_indexed = resource_filename(
        "plastid", "test/data/annotations/dmel-bonus-cols.bb")
def test_search_fields_singlevalue(self):
    """Search the indexed BigBed file with one value per field.

    Covers three cases: a value that is not indexed (no results), a `Name`
    that matches one record, and a `gene_id` that matches every isoform
    listed in `all_isoforms` (compared after sorting both sides).
    """
    def tx(name, tx_id, alias, thickstart, thickend, *exons):
        # all expected records are plus-strand chains on 2L sharing the
        # same color, gene_id, score and type
        segments = [GenomicSegment('2L', start, end, '+') for start, end in exons]
        return SegmentChain(*segments,
                            Alias=alias,
                            ID=tx_id,
                            Name=name,
                            color='#000000',
                            gene_id='FBgn0005278',
                            score='0.0',
                            thickend=thickend,
                            thickstart=thickstart,
                            type='exon')

    def sam_s_re():
        return tx('Sam-S-RE', 'FBtr0089437', "'['M(2)21AB-RE', 'CG2674-RE']'",
                  '108685', '113542',
                  (106902, 107000), (107764, 107838), (108587, 108809), (110405, 110483),
                  (110754, 110877), (111906, 112019), (112689, 113369), (113433, 114432))

    def all_isoforms():
        return [
            sam_s_re(),
            tx('Sam-S-RK', 'FBtr0308091', 'na', '108685', '110900',
               (107760, 107838), (108587, 108809), (110405, 110483), (110754, 111337)),
            tx('Sam-S-RB', 'FBtr0089428', "'['M(2)21AB-RB', 'CG2674-RB']'",
               '108685', '112741',
               (107760, 107838), (108587, 108809), (110405, 110483), (110754, 110877),
               (111004, 111117), (111906, 112019), (112689, 113369), (113433, 114210)),
            tx('Sam-S-RA', 'FBtr0089429', "'['M(2)21AB-RA', 'CG2674-RA']'",
               '108685', '113542',
               (107760, 107838), (108587, 108809), (110405, 110483), (110754, 110877),
               (111906, 112019), (112689, 113369), (113433, 114432)),
            tx('Sam-S-RL', 'FBtr0330656', 'na', '108685', '112781',
               (107760, 107956), (108587, 108809), (110405, 110483), (110754, 110877),
               (112689, 113369), (113433, 114432)),
            tx('Sam-S-RH', 'FBtr0089432', "'['M(2)21AB-RH', 'CG2674-RH']'",
               '108685', '113542',
               (107936, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
               (111906, 112019), (112689, 113369), (113433, 114210)),
            tx('Sam-S-RD', 'FBtr0089430', "'['M(2)21AB-RD', 'CG2674-RD']'",
               '108685', '113542',
               (107936, 108101), (108587, 108809), (110405, 110483), (110754, 110877),
               (111906, 112019), (112689, 113369), (113433, 114432)),
            tx('Sam-S-RC', 'FBtr0089431', "'['M(2)21AB-RC', 'CG2674-RC']'",
               '108685', '113542',
               (107936, 108101), (108587, 108809), (110405, 110483), (110754, 110877),
               (111004, 111117), (112689, 113369), (113433, 114432)),
            tx('Sam-S-RF', 'FBtr0089433', "'['M(2)21AB-RF', 'CG2674-RF']'",
               '108685', '113542',
               (108088, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
               (111906, 112019), (112689, 113369), (113433, 114432)),
            tx('Sam-S-RI', 'FBtr0089434', "'['M(2)21AB-RI', 'CG2674-RI']'",
               '108685', '113542',
               (108132, 108346), (108587, 108809), (110405, 110483), (110754, 110877),
               (111906, 112019), (112689, 113369), (113433, 114432)),
            tx('Sam-S-RJ', 'FBtr0089435', "'['M(2)21AB-RJ', 'CG2674-RJ']'",
               '108685', '113542',
               (108132, 108226), (108587, 108809), (110405, 110483), (110754, 110877),
               (111004, 111117), (112689, 113369), (113433, 114432)),
            tx('Sam-S-RG', 'FBtr0089436', "'['M(2)21AB-RG', 'CG2674-RG']'",
               '109750', '113542',
               (109593, 109793), (110405, 110483), (110754, 110877), (111004, 111117),
               (112689, 113369), (113433, 114210)),
        ]

    reader = BigBedReader(self.bb_indexed)

    # nothing is indexed under this value
    found = list(reader.search("name", "should_have_no_match"))
    self.assertEqual([], found)

    # one transcript name -> one record
    found = list(reader.search("Name", "Sam-S-RE"))
    expected = [sam_s_re()]
    self.assertEqual(expected, found)

    # one gene ID -> all of its isoforms, order not significant
    found = list(reader.search("gene_id", "FBgn0005278"))
    expected = all_isoforms()
    self.assertEqual(sorted(expected), sorted(found))
def test_indexed_fields(self):
    """Check that the indexed BigBed file reports exactly the four expected indexed fields."""
    expected_fields = ["gene_id", "name", "Name", "Alias"]
    expected_fields.sort()

    reader = BigBedReader(self.bb_indexed)
    found_fields = sorted(reader.indexed_fields)

    # order is not significant, so both sides are compared sorted
    self.assertEqual(expected_fields, found_fields)
def test_search_fields_invalid_raises_error(self):
    """Check that calling `search` with a field name that is not indexed raises `KeyError`."""
    reader = BigBedReader(self.bb_indexed)
    with self.assertRaises(KeyError):
        reader.search("garbage_field", "garbage_value")
def do_generate(args, annotation_parser, mask_parser):
    """Generate gene position files from gene annotations.

    1.  Genes whose transcripts share exons are first collapsed into merged
        genes.

    2.  Within merged genes, all positions are classified. All positions are
        included in a set called *exon*. All positions that appear as coding
        regions in all transcripts (i.e. are never part of a 5'UTR or 3'UTR)
        included in a set called *CDS*. Similarly, all positions that appear
        as 5' UTR or 3' UTR in all transcripts are included in sets called
        *UTR5* or *UTR3*, respectively.

    3.  Genomic positions that are overlapped by multiple merged genes are
        excluded from the position sets for those genes.

    4.  If a :term:`mask file` is supplied, positions annotated in the mask
        file are also excluded

    5.  Output is given as a series of `BED`_ files and a `positions` file
        containing the same data.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``generate`` subprogram

    annotation_parser
        Object whose ``get_transcripts_from_args()`` method supplies the
        iterator of transcripts to process

    mask_parser
        Object whose ``get_genome_hash_from_args()`` method supplies the
        hash of masked regions
    """
    # variables for transcript <-> merged gene mapping
    transcripts = {}
    merged_genes = {}

    # data table for merged genes
    gene_table = pd.DataFrame({
        "region": [],
        "transcript_ids": [],
        "exon_unmasked": [],
        "exon": [],
        "masked": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    })

    # data table for transcripts
    transcript_table = pd.DataFrame({
        "region": [],
        "exon": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked": [],
        "exon_unmasked": [],
        "transcript_ids": [],
        "masked_bed": [],
    })

    # data
    is_sorted = (args.sorted == True) or \
                (args.tabix == True) or \
                (args.annotation_format == "BigBed")

    # Collapse every run of whitespace (source indentation and line breaks)
    # to one space. Stripping a literal space sequence would depend on how
    # this literal happens to be indented and can delete the spaces between
    # words.
    annotation_message = " ".join("""`cs` relies upon relationships between
        transcripts and genes to collapse transcripts to genes for
        quantitation. Gene-transcript relationships are not generally
        preserved in BED or BigBed files, and a `gene_id` column could not be
        found in the input data. This may yield nonsensical results in the
        output. Consider either (1) using a GTF2 or GFF3 file or (2) creating
        an extended BED or BigBed file with a `gene_id` column.""".split())

    if args.annotation_format == "BED":
        if not isinstance(args.bed_extra_columns,
                          list) or 'gene_id' not in args.bed_extra_columns:
            warnings.warn(annotation_message, FileFormatWarning)
    elif args.annotation_format == "BigBed":
        # only the first annotation file is inspected for a `gene_id` column
        reader = BigBedReader(args.annotation_files[0])
        if 'gene_id' not in reader.extension_fields:
            warnings.warn(annotation_message, FileFormatWarning)

    source = annotation_parser.get_transcripts_from_args(args, printer=printer)
    mask_hash = mask_parser.get_genome_hash_from_args(args)

    # loop conditions
    last_chrom = None
    do_loop = True

    # to save memory, we process one chromosome at a time if input file is sorted
    # knowing that at that moment all transcript parts are assembled
    while do_loop:
        try:
            tx = next(source)
        except StopIteration:
            # keep going for one final pass so the last group is flushed;
            # `tx` still refers to the last transcript read, if any
            do_loop = False

        try:
            # if chromosome is completely processed or EOF
            if (is_sorted and tx.spanning_segment.chrom != last_chrom) or not do_loop:
                if do_loop:
                    # `tx` belongs to the next chromosome: push it back so
                    # it is seen again after the current group is flushed
                    source = itertools.chain([tx], source)

                if last_chrom is not None or not do_loop:
                    printer.write("Merging genes on chromosome/contig '%s'" % last_chrom)
                    my_gene_table, my_transcript_table, my_merged_genes = process_partial_group(
                        transcripts, mask_hash, printer)
                    gene_table = pd.concat((gene_table, my_gene_table), axis=0)
                    transcript_table = pd.concat((transcript_table, my_transcript_table), axis=0)
                    merged_genes.update(my_merged_genes)

                # drop the finished group and force a collection so its
                # memory is released before the next chromosome is read
                del transcripts
                gc.collect()
                del gc.garbage[:]
                transcripts = {}

                # reset last chrom
                last_chrom = tx.spanning_segment.chrom

            # otherwise, remember transcript
            else:
                transcripts[tx.get_name()] = tx

        # exit gracefully if no transcripts found: `tx` was never bound
        # because the very first next() raised StopIteration
        except UnboundLocalError:
            pass

    # write output
    printer.write("Writing output ...")

    merged_fn = "%s_merged.txt" % args.outbase
    number_merged = len(set(merged_genes.values()))
    printer.write("Collapsed %s genes to %s merged groups. Writing to %s" %
                  (len(merged_genes), number_merged, merged_fn))
    fout = argsopener(merged_fn, args, "w")
    try:
        for gene, merged_name in sorted(merged_genes.items()):
            fout.write("%s\t%s\n" % (gene, merged_name))
    finally:
        # close the handle even if a write fails
        fout.close()

    printer.write("Writing gene table and BED files ...")
    write_output_files(gene_table, "gene", args)

    printer.write("Writing transcript summary table and BED files ...")
    write_output_files(transcript_table, "transcript", args)

    printer.write("Done!")