def test_RNA_position_placement_split(self):
    """
    Check that RNA_position handles genes made of several regions, on
    both the positive and the negative strand.
    """
    # Positive strand: regions listed in ascending order.
    plus_fields = "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 + 125 125".split()
    plus_locations = {
        "ENSMUSG1": {"strand": "+", "regions": [(0, 50), (100, 150)]},
    }
    tool = pybedtools.create_interval_from_list(plus_fields)
    self.assertEqual(RNA_position(tool, plus_locations), (.50, .75))

    # Negative strand: regions listed in descending order.
    minus_fields = "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 25 25".split()
    minus_locations = {
        "ENSMUSG1": {"strand": "-", "regions": [(100, 150), (0, 50)]},
    }
    tool = pybedtools.create_interval_from_list(minus_fields)
    self.assertEqual(RNA_position(tool, minus_locations), (.50, .75))
def test_RNA_position_placement(self):
    """
    Check the fractional placement RNA_position reports inside a single
    region, once per strand.
    """
    regions = [(0, 100)]

    plus_tool = pybedtools.create_interval_from_list(
        "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 + 60 60".split())
    plus_locations = {"ENSMUSG1": {"strand": "+", "regions": list(regions)}}
    self.assertEqual(RNA_position(plus_tool, plus_locations), (.60, .60))

    minus_tool = pybedtools.create_interval_from_list(
        "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 60 60".split())
    minus_locations = {"ENSMUSG1": {"strand": "-", "regions": list(regions)}}
    # Result tuple is (individual_fraction, total_fraction).
    self.assertEqual(RNA_position(minus_tool, minus_locations), (.4, .4))
def create_bed_tool_from_miso_a5ss(miso_annotation, is_alt=True):
    """
    Deprecated function.

    Turn one ':'-separated MISO A5SS exon description into BED-style
    interval(s); the start coordinate of each interval is shifted by -1.

    Parameters
    ----------
    miso_annotation : str
        ``chrom:start:end:strand``. When ``is_alt`` is True the ``end``
        field carries two '|'-separated coordinates.
    is_alt : bool
        True (default): return the pair ``(splice1, splice2)``.
        Otherwise: return a single interval.

    Raises
    ------
    ValueError
        If the annotation does not split into the expected number of parts.
    """
    chrom, start, end, strand = miso_annotation.split(':')

    if is_alt == True:  # noqa: E712 - strict comparison kept from the original
        # format is: chr2:183800103:183799993|183800021:-@chr2:183799480:183799560:-
        end1, end2 = end.split('|')
        if strand == '+':
            first_fields = [chrom, int(start) - 1, end1, '0', '0', strand]
        else:
            first_fields = [chrom, int(end2) - 1, start, '0', '0', strand]
        splice1 = bt.create_interval_from_list(first_fields)
        # middle (identical for both strands)
        splice2 = bt.create_interval_from_list(
            [chrom, int(end1) - 1, end2, '0', '0', strand])
        return splice1, splice2

    # format is: chr17:80008538:80008640:-
    return bt.create_interval_from_list(
        [chrom, int(start) - 1, end, '0', '0', strand])
def test_pickleable():
    """
    Round-trip BED4, BED3 and GFF intervals through pickle and check that
    the string form of each interval is unchanged.
    """
    import pickle

    field_sets = [
        ['chr1', '1', '100', 'asdf'],
        ['chr1', '1', '100'],
        # GFF fields spelled out explicitly rather than split('\t') on a
        # literal, so the test does not depend on invisible tab characters.
        ['chr2L', '.', 'UTR', '41', '70', '0', '+', '.',
         'ID=mRNA:xs2:UTR:41-70;Parent=mRNA:xs2;'],
    ]

    for fields in field_sets:
        interval = pybedtools.create_interval_from_list(fields)
        fn = pybedtools.BedTool._tmp()
        # Pickle data is binary; the context managers close both handles
        # (the original left the read handle open).
        with open(fn, 'wb') as out:
            pickle.dump(interval, out)
        with open(fn, 'rb') as handle:
            new_interval = pickle.load(handle)
        assert str(interval) == str(new_interval)
def test_second_start_segmentation(self):
    """
    Exercise xlsites._second_start against a two-gene segmentation for
    several read position tuples on both strands.
    """
    def exon(start, end, strand):
        # Both fixtures deliberately carry the same gene/transcript labels.
        return pybedtools.create_interval_from_list(
            ['1', '.', 'exon', start, end, '.', strand, '.',
             'gene_id: "G001"', 'transcript_id: "T0001"'])

    segmentation = {
        'G001': {
            'gene_segment': [],
            'T0001': [exon('100', '200', '+')],
        },
        'G002': {
            'gene_segment': [],
            'T0002': [exon('50', '100', '-')],
        },
    }

    # (positions, strand, expected second start)
    cases = (
        ((1, 2, 99, 100), '+', 99),
        ((99, 100, 199, 200), '-', 100),
        ((1, 2, 4, 5), '-', 2),
    )
    for poss, strand, expected in cases:
        second_start, _ = xlsites._second_start(
            read=0, poss=poss, strand=strand, chrom=1,
            segmentation=segmentation, holesize_th=4)
        self.assertEqual(second_start, expected)
def test_pickleable():
    """
    Pickle and unpickle three kinds of interval (BED4, BED3, GFF) and
    verify each one prints identically after the round trip.
    """
    import pickle

    def roundtrip(fields):
        interval = pybedtools.create_interval_from_list(fields)
        fn = pybedtools.BedTool._tmp()
        # Binary mode is required for pickle streams; 'with' closes the
        # handles, which the original read path never did.
        with open(fn, 'wb') as out:
            pickle.dump(interval, out)
        with open(fn, 'rb') as handle:
            new_interval = pickle.load(handle)
        assert str(interval) == str(new_interval)

    roundtrip(['chr1', '1', '100', 'asdf'])
    roundtrip(['chr1', '1', '100'])
    # Explicit field list instead of split('\t') on a literal, so the GFF
    # case does not hinge on literal tab characters inside the source.
    roundtrip(['chr2L', '.', 'UTR', '41', '70', '0', '+', '.',
               'ID=mRNA:xs2:UTR:41-70;Parent=mRNA:xs2;'])
def finder(self, region):
    """
    Scan ``self.current_seq`` with the plus- and minus-strand regexes.

    Every hit is appended to ``self.intervals`` as a BED6 interval whose
    coordinates are offset by ``region.start``, and is drawn into a
    '.'-filled display track. The finished track is yielded once, as a
    single string.
    """
    sequence = self.current_seq
    track = ['.'] * len(sequence)

    def record(hit, shown, sign):
        start, stop = hit.span()
        track[start:stop] = shown
        label = ' (%s)' % sign
        # NOTE(review): the slice ends at len(label) (always 4), not at
        # stop + len(label); for stop >= 4 this *inserts* the label and
        # lengthens the track. Behaviour kept as-is -- confirm intent.
        track[stop:len(label)] = label
        self.intervals.append(pybedtools.create_interval_from_list([
            region.chrom,
            str(region.start + start),
            str(region.start + stop),
            hit.group(),
            '0',
            sign]))

    for hit in self.regex_plus.finditer(sequence):
        record(hit, hit.group(), '+')

    for hit in self.regex_minus.finditer(sequence):
        # Minus-strand hits are displayed as their reverse complement.
        record(hit, Seq(hit.group()).reverse_complement(), '-')

    yield ''.join(track)
def test_convert_to_mRNA_position_placement(self):
    """
    Makes sure that the placement within a region or list of regions is
    correct.
    """
    # Test is currently disabled: the bare return below exits before any
    # assertion runs. The body is kept for reference.
    return

    interval = pybedtools.create_interval_from_list(
        "ENSMUSG1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 + 60 60".split())
    location_dict = {"ENSMUSG1": {"strand": "+", "regions": [(0, 100)]}}
    correct_tool = pybedtools.create_interval_from_list(
        "ENSMUSG1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 + 60 60".split())
    self.assertEqual(
        convert_to_mRNA_position(interval, location_dict), correct_tool)

    interval = pybedtools.create_interval_from_list(
        "ENSMUSG1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 -".split())
    location_dict = {"ENSMUSG1": {"strand": "-", "regions": [(0, 100)]}}
    # individual_fraction, total_fraction
    correct_tool = pybedtools.create_interval_from_list(
        "ENSMUSG1 40 50 ENSMUSG1_1_83;ENSMUSG1_6_83 0 -".split())
    x = convert_to_mRNA_position(interval, location_dict)
    # Parenthesised form parses on Python 2 and 3; the bare print statement
    # was a SyntaxError on Python 3 even though it never executes.
    print(x)
    self.assertEqual(x, correct_tool)
def test_RNA_position_placement(self):
    """
    Verify RNA_position placement fractions for a single-region gene on
    the plus strand and on the minus strand.
    """
    def locations(strand):
        return {"ENSMUSG1": {"strand": strand, "regions": [(0, 100)]}}

    tool = pybedtools.create_interval_from_list(
        "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 + 60 60".split())
    self.assertEqual(RNA_position(tool, locations("+")), (.60, .60))

    tool = pybedtools.create_interval_from_list(
        "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 60 60".split())
    # Tuple layout: (individual_fraction, total_fraction)
    self.assertEqual(RNA_position(tool, locations("-")), (.4, .4))
def test_fix_bed6_empytname3(self):
    """A '.' entry in the middle of a comma-separated name is dropped."""
    given = ['1', '1', '10', 'b,.,a', '5', '+']
    wanted = ['1', '1', '10', 'b,a', '5', '+']

    converted = clusters._fix_bed6_emptyname(create_interval_from_list(given))
    expected = create_interval_from_list(wanted)

    self.assertEqual(expected, converted)
def bedtool_from_renamed_twobed_index2(name, stream):
    """
    WARNING THIS IS ONLY GOOD FOR PHASTCON MASK FUNCTION

    Pick one of the two BED6 records packed into ``name``.

    Parameters
    ----------
    name : str
        Twelve tab-separated fields: a "low" BED6 record followed by a
        "hi" BED6 record.
    stream : str
        'upstream' or 'downstream'.

    Returns
    -------
    pybedtools Interval
        For 'upstream' the low record when both strands are '+', for
        'downstream' the low record when both strands are '-'; the hi
        record in every other case.

    Raises
    ------
    ValueError
        If ``name`` does not hold exactly twelve fields, or if ``stream``
        is not one of the two supported values (previously this surfaced
        as an UnboundLocalError at the return statement).
    """
    fields = name.split('\t')
    if len(fields) != 12:
        raise ValueError(
            "expected 12 tab-separated fields, got %d" % len(fields))
    low, high = fields[:6], fields[6:]
    low_strand, hi_strand = low[5], high[5]

    # Strand on which the low record is the one to return for this stream.
    if stream == 'upstream':
        low_wins_on = '+'
    elif stream == 'downstream':
        low_wins_on = '-'
    else:
        raise ValueError(
            "stream must be 'upstream' or 'downstream', got %r" % (stream,))

    if low_strand == low_wins_on and hi_strand == low_wins_on:
        return pybedtools.create_interval_from_list(low)
    return pybedtools.create_interval_from_list(high)
def create_bed_tool_from_miso_a3ss(miso_annotation, is_alt=True):
    """
    Deprecated function.

    Turn one ':'-separated MISO A3SS exon description into BED-style
    interval(s); the start coordinate of each interval is shifted by -1.

    Parameters
    ----------
    miso_annotation : str
        ``chrom:start:end:strand``. When ``is_alt`` is True the ``start``
        field carries two '|'-separated coordinates.
    is_alt : bool
        True (default): return the pair ``(splice1, splice2)``.
        Otherwise: return a single interval.

    Raises
    ------
    ValueError
        If the annotation does not split into the expected parts, or if
        ``is_alt`` is True and the strand is neither '+' nor '-'
        (previously an UnboundLocalError at the return statement).
    """
    chrom, start, end, strand = miso_annotation.split(':')

    if is_alt == True:  # noqa: E712 - strict comparison kept for compatibility
        # format is:
        # chr2:55764619:55764721:+@chr2:55771074|55771161:55771210:+ ENSG00000163001
        # chr17:62502194:62502407:-@chr17:62500960|62500998:62500795:- ENSG00000108654
        # chr2:55771074|55771161:55771210:+
        # chr1:43830128|43830131:43829995:-
        start1, start2 = start.split('|')
        if strand not in ('+', '-'):
            raise ValueError(
                "strand must be '+' or '-', got %r" % (strand,))

        # The first interval is the same on both strands.
        splice1 = bt.create_interval_from_list(
            [chrom, int(start1) - 1, start2, '0', '0', strand])
        if strand == '+':
            # the downstream one
            second_fields = [chrom, int(start2) - 1, end, '0', '0', strand]
        else:
            second_fields = [chrom, int(end) - 1, start1, '0', '0', strand]
        splice2 = bt.create_interval_from_list(second_fields)
        return splice1, splice2

    # format is: chr17:80008538:80008640:-
    return bt.create_interval_from_list(
        [chrom, int(start) - 1, end, '0', '0', strand])
def string_to_interval(s):
    """
    Convert string of the form "chrom:start-stop" or
    "chrom:start-stop[strand]" to an interval.

    Assumes zero-based coords.

    If it's already an interval (anything that is not a string), then
    return it as-is.

    Raises
    ------
    ValueError
        If ``s`` is a string that ``coord_re`` does not match (previously
        an AttributeError on ``None``).
    """
    try:
        string_types = basestring  # Python 2, or a compat alias in scope
    except NameError:
        string_types = str

    if not isinstance(s, string_types):
        return s

    m = coord_re.search(s)
    if m is None:
        raise ValueError("cannot parse %r as a genomic coordinate" % (s,))

    fields = [m.group('chrom'), m.group('start'), m.group('stop')]
    strand = m.group('strand')
    if strand:
        # Name and score placeholders are needed to carry a strand column.
        fields.extend(['.', '0', strand])
    return pybedtools.create_interval_from_list(fields)
def test_convert_to_mRNA_position_placement_split(self):
    """
    Makes sure that lists of regions work for both positive and negative
    strands.
    """
    # Disabled test: the bare return exits before any assertion runs.
    return

    gene = "ENSMUSG1"

    plus_locations = {gene: {"strand": "+", "regions": [(0, 50), (100, 150)]}}
    tool = pybedtools.create_interval_from_list(
        "ENSMUSG1 125 127 ENSMUSG1_1_83;ENSMUSG1_6_83 0 + 125 125".split())
    correct_tool = pybedtools.create_interval_from_list(
        "ENSMUSG1 75 77 ENSMUSG1_1_83;ENSMUSG1_6_83 0 + 125 125".split())
    self.assertEqual(
        convert_to_mRNA_position(tool, plus_locations), correct_tool)

    minus_locations = {gene: {"strand": "-", "regions": [(100, 150), (0, 50)]}}
    tool = pybedtools.create_interval_from_list(
        "ENSMUSG1 25 27 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 25 25".split())
    correct_tool = pybedtools.create_interval_from_list(
        "ENSMUSG1 73 75 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 25 25".split())
    self.assertEqual(
        convert_to_mRNA_position(tool, minus_locations), correct_tool)
def intersection2gff(intersection):
    """
    Split one tab-separated intersection record into two GFF intervals.

    The first nine columns form the first interval. Columns from index 9
    on form the second one, unless column 9 is '.', in which case the
    second element of the returned pair is None.
    """
    columns = str(intersection).strip().split("\t")
    # Checked before building anything, as in the original.
    has_partner = columns[9] != '.'

    first = create_interval_from_list(columns[:9])
    second = create_interval_from_list(columns[9:]) if has_partner else None
    return first, second
def get_bedtools(self):
    """
    Build the upstream and downstream intervals for a 'twobed' annotation.

    ``self.annotation`` is read as twelve tab-separated fields (two BED6
    records, "lower" then "upper"). When both strands are '+', lower is
    upstream; when both are '-', upper is upstream.

    Returns
    -------
    (upstream, downstream) tuple; both are None when ``self.source`` is
    not 'twobed'. Returns -1 (after printing a warning) when the two
    strands are not both '+' or both '-'.
    """
    upstream = downstream = None

    if self.source == 'twobed':
        (lower_chrom, lower_start, lower_end,
         lower_name, lower_score, lower_strand,
         upper_chrom, upper_start, upper_end,
         upper_name, upper_score, upper_strand) = self.annotation.split('\t')

        lower = [lower_chrom, lower_start, lower_end,
                 lower_name, lower_score, lower_strand]
        upper = [upper_chrom, upper_start, upper_end,
                 upper_name, upper_score, upper_strand]

        strands = (lower_strand, upper_strand)
        if strands == ('+', '+'):
            up_fields, down_fields = lower, upper
        elif strands == ('-', '-'):
            up_fields, down_fields = upper, lower
        else:
            print("Warning, strand not correct!")
            return -1

        upstream = bt.create_interval_from_list(up_fields)
        downstream = bt.create_interval_from_list(down_fields)

    return upstream, downstream
def test_convert_to_mRNA_position_fail(self):
    """
    Various attempts to break RNA position and make sure that errors are
    caught.
    """
    # Disabled test: the bare return exits before any assertion runs.
    return

    location_dict = {
        "ENSMUSG1": {"strand": "-", "regions": [(100, 150), (25, 50)]},
    }
    bed_lines = (
        "ENSMUSG1 51 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 10 10",
        "ENSMUSG1 51 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 175 175",
    )
    for bed_line in bed_lines:
        tool = pybedtools.create_interval_from_list(bed_line.split())
        converted = convert_to_mRNA_position(tool, location_dict)
        self.assertEqual(converted.chrom, "none")

    pybedtools.BedTool("chr1 x y ")
def test_all_hits():
    """all_hits returns every overlap, or only same-strand ones on request."""
    def query():
        # A fresh minus-strand query interval for every call.
        return pybedtools.create_interval_from_list(
            ['chr1', '450', '905', '.', '.', '-'])

    a = pybedtools.example_bedtool('a.bed')

    expected_any_strand = [a[2], a[3]]
    assert expected_any_strand == a.all_hits(query())

    expected_same_strand = [a[2]]
    assert expected_same_strand == a.all_hits(query(), same_strand=True)
def test_minus_feature_is_reversed_profile(self):
    """
    Coverage over a minus-strand feature is the plus-strand profile
    reversed, while the x coordinates stay the same.
    """
    def feature_on(strand):
        return pybedtools.create_interval_from_list(
            ['chr2L', '0', '20', '.', '.', strand])

    x, y = self.m.local_coverage(feature_on('+'), fragment_size=5)
    xm, ym = self.m.local_coverage(feature_on('-'), fragment_size=5)

    pp(list(enumerate(zip(ym, y))))
    assert list(ym) == list(y[::-1])
    assert list(xm) == list(x)
def test_count_hits():
    """Count overlaps with and without the same-strand restriction."""
    fields = ['chr1', '450', '905', '.', '.', '-']
    a = pybedtools.example_bedtool('a.bed')

    hits = a.all_hits(pybedtools.create_interval_from_list(list(fields)))
    assert len(hits) == 2

    stranded_hits = a.all_hits(
        pybedtools.create_interval_from_list(list(fields)),
        same_strand=True)
    assert len(stranded_hits) == 1
def _add_biotype_attribute(gene_content):
    """
    Add a `biotype` attribute to all intervals in gene_content.

    For each transcript, the biotype is taken from its first interval whose
    feature type (column 3) is 'CDS' or 'ncRNA': the transcript_biotype
    attribute if set, else gene_biotype, else the value of column 2
    (index 1). The original comments say the last option can only happen
    in some early ensembl releases.

    Parameters
    ----------
    gene_content : dict
        Intervals in a gene keyed by transcript id, plus a 'gene' key
        holding the gene interval.

    Returns
    -------
    dict
        Shallow copy of ``gene_content`` whose intervals are rebuilt with
        the added `biotype` attribute. The gene interval gets the list of
        all biotypes seen, most frequent first.

    Raises
    ------
    IndexError
        If a transcript has no 'CDS' or 'ncRNA' interval.
    """
    gene_content = gene_content.copy()

    # Seed the tally with the gene-level biotype, when there is one.
    gene_level = gene_content['gene'].attrs.get('gene_biotype', None)
    gene_biotypes = [gene_level] if gene_level else []

    for transcript_id, transcript_intervals in gene_content.items():
        if transcript_id == 'gene':
            continue

        exon = [i for i in transcript_intervals if i[2] in ['CDS', 'ncRNA']][0]
        gbiotype = exon.attrs.get('gene_biotype', None)
        tbiotype = exon.attrs.get('transcript_biotype', None)
        biotype = tbiotype or gbiotype or exon[1]
        gene_biotypes.append(biotype)

        suffix = ' biotype "{}";'.format(biotype)
        # Only existing keys are reassigned, so the dict size is stable
        # while it is being iterated.
        gene_content[transcript_id] = [
            create_interval_from_list(
                interval[:8]
                + [(interval[8] if interval[8] != '.' else '') + suffix])
            for interval in transcript_intervals
        ]

    # Gene biotype: all biotypes in the gene, most frequent first, ties
    # broken alphabetically.
    ranked = sorted(Counter(gene_biotypes).items(),
                    key=lambda item: (-item[1], item[0]))
    biotype = ', '.join([name for name, _count in ranked])

    gene_interval = gene_content['gene']
    gene_content['gene'] = create_interval_from_list(
        gene_interval[:8]
        + [gene_interval[8] + ' biotype "[{}]";'.format(biotype)])

    return gene_content
def test_basic(self):
    """
    get_gene_name picks G0 over G1 in either argument order and G1 over
    G2; passing the same segment twice raises ValueError.
    """
    def segment(feature_type, gene_name):
        return create_interval_from_list(
            ['1', '.', feature_type, '1', '2', '.', '+', '.',
             'gene_name "%s";' % gene_name])

    seg_level0 = segment('CDS', 'G0')
    seg_level1 = segment('UTR3', 'G1')
    seg_level4 = segment('intron', 'G2')

    self.assertEqual('G0', landmark.get_gene_name(seg_level0, seg_level1))
    self.assertEqual('G0', landmark.get_gene_name(seg_level1, seg_level0))
    self.assertEqual('G1', landmark.get_gene_name(seg_level1, seg_level4))

    with self.assertRaises(ValueError):
        self.assertEqual('B', landmark.get_gene_name(seg_level0, seg_level0))
def test_all_hits():
    """Check all_hits with and without the same_strand restriction."""
    minus_query = ['chr1', '450', '905', '.', '.', '-']
    a = pybedtools.example_bedtool('a.bed')

    any_strand = a.all_hits(
        pybedtools.create_interval_from_list(list(minus_query)))
    assert [a[2], a[3]] == any_strand

    same_strand_only = a.all_hits(
        pybedtools.create_interval_from_list(list(minus_query)),
        same_strand=True)
    assert [a[2]] == same_strand_only
def test_minus_feature_is_reversed_profile(self):
    """
    The y profile of a minus-strand feature mirrors the plus-strand
    profile; the x values are identical.
    """
    base_fields = ['chr2L', '0', '20', '.', '.']
    plus_feature = pybedtools.create_interval_from_list(base_fields + ['+'])
    minus_feature = pybedtools.create_interval_from_list(base_fields + ['-'])

    x, y = self.m.local_coverage(plus_feature, fragment_size=5)
    xm, ym = self.m.local_coverage(minus_feature, fragment_size=5)

    pp(list(enumerate(zip(ym, y))))
    assert list(ym) == list(y[::-1])
    assert list(xm) == list(x)
def test_any_hits():
    """any_hits reports 1/0 for overlap, honouring same_strand."""
    def minus_interval(start, stop):
        return pybedtools.create_interval_from_list(
            ['chr1', start, stop, '.', '.', '-'])

    a = pybedtools.example_bedtool('a.bed')

    assert 1 == a.any_hits(minus_interval('900', '905'))
    assert 0 == a.any_hits(minus_interval('900', '905'), same_strand=True)
    assert 0 == a.any_hits(minus_interval('8000', '9000'))
def get_bedtool_4():
    """
    Return a BedTool holding two plus-strand chr1 intervals that both
    start at 881553: one 2nt long, one (per the original docstring) 10nt
    long, within the CDS of ENSG00000188976.6.
    """
    # NOTE(review): 881553-881565 spans 12 bases, not 10 -- confirm.
    coordinates = (('881553', '881555'), ('881553', '881565'))
    intervals = [
        pybedtools.create_interval_from_list(
            ['chr1', start, stop, '.', '0', '+'])
        for start, stop in coordinates
    ]
    return pybedtools.BedTool(intervals)
def get_jx_region_as_interval_eric(row, x, event='se'):
    """
    Return a BedTools interval for an rmats annotation row, spanning from
    the upstream-end to the downstream-start.

    Parameters
    ----------
    row : pandas.core.series.Series
        Single row of an rMATS file. ``row['annotation']`` must hold five
        '|'-separated fields, the first two being chrom and strand; the
        exon columns read depend on ``event`` and hold 'start-end'.
    x : basestring
        Name given to the bedtools interval.
    event : str
        One of 'se', 'mxe', 'ri', 'a3ss', 'a5ss'.

    Returns
    -------
    pybedtools.BedTool.Interval

    Raises
    ------
    ValueError
        If ``event`` is not supported (previously an UnboundLocalError at
        the return statement) or a field cannot be parsed.
    """
    chrom, strand, _, _, _ = row['annotation'].split('|')

    def exon_bounds(column):
        # 'start-end' -> [start, end] as ints
        return [int(ex) for ex in row[column].split('-')]

    if event in ('se', 'mxe', 'ri'):
        low_start, low_end = exon_bounds('low_exon')
        hi_start, hi_end = exon_bounds('hi_exon')
        jx_start, jx_end = low_end, hi_start
    elif event == 'a3ss':
        flank_start, flank_end = exon_bounds('upstream_exon')
        short_start, short_end = exon_bounds('short_exon')
        if strand == '+':
            jx_start, jx_end = flank_end, short_start
        else:
            jx_start, jx_end = short_end, flank_start
    elif event == 'a5ss':
        flank_start, flank_end = exon_bounds('downstream_exon')
        short_start, short_end = exon_bounds('short_exon')
        if strand == '+':
            jx_start, jx_end = short_end, flank_start
        else:
            jx_start, jx_end = flank_end, short_start
    else:
        raise ValueError("unsupported event type: %r" % (event,))

    return bt.create_interval_from_list(
        [chrom, jx_start, jx_end, x, '0', strand])
def test_gtf_gff_attrs():
    """Smoke test: accessing .attrs must not raise for GFF or GTF input."""
    # This has always worked: GFF-style key=value attributes.
    gff_fields = ["chr1", "fake", "mRNA", "51", "300", ".", "+", ".",
                  "ID=mRNA1;Parent=gene1;"]
    # This previously failed because of the "=" in the attr string.
    gtf_fields = ['scaffold_52', 'Cufflinks', 'exon', '5478', '5568', '.',
                  '+', '.',
                  'gene_id "XLOC_017766"; transcript_id "TCONS_00033979"; exon_number "6"; gene_name "g18412"; oId "PAC:26897502"; nearest_ref "PAC:26897502"; class_code "="; tss_id "TSS21210"; p_id "P18851";']

    for fields in (gff_fields, gtf_fields):
        feature = pybedtools.create_interval_from_list(fields)
        feature.attrs
def test_count_hits():
    """Two overlaps in total, one of them on the query's own strand."""
    def query():
        return pybedtools.create_interval_from_list(
            ['chr1', '450', '905', '.', '.', '-'])

    a = pybedtools.example_bedtool('a.bed')

    total = len(a.all_hits(query()))
    assert total == 2

    on_same_strand = len(a.all_hits(query(), same_strand=True))
    assert on_same_strand == 1
def test_interval_index():
    """
    Supplement to the more general test in
    test_cbedtools.IntervalTest.testGetItemNegative: negative indices and
    slices on BED and GFF intervals.
    """
    iv = pybedtools.create_interval_from_list(
        'chr21 9719768 9721892 ALR/Alpha 1004 +'.split())
    assert iv[-1] == '+'
    assert iv[2:-1] == ['9721892', 'ALR/Alpha', '1004']

    iv = pybedtools.create_interval_from_list(
        ['chr1', 'ucb', 'gene', '465', '805', '.', '+', '.',
         'ID=thaliana_1_465_805;match=scaffold_801404.1;rname=thaliana_1_465_805'])
    # Parenthesised print works on Python 2 and 3; the bare statement form
    # made the whole module a SyntaxError on Python 3.
    print(iv[4:-3])
    assert iv[4:-3] == ['805', '.']
def _create_window(self):
    """
    Yield successive BED3 window intervals over ``self.start``..``self.stop``.

    Windows are ``self.size`` wide and advance by ``self.slide``. They are
    yielded while both the window start and the window end are below
    ``self.stop``; a final window from the current start to ``self.stop``
    is always yielded last.
    """
    def make_window(window_start, window_stop):
        # create_interval_from_list needs a real list of strings; map()
        # would hand it an iterator on Python 3.
        fields = [str(v) for v in (self.chrom, window_start, window_stop)]
        return pybedtools.create_interval_from_list(fields)

    istart = self.start               # interval start
    istop = self.start + self.size    # interval stop
    while istop < self.stop and istart < self.stop:
        yield make_window(istart, istop)
        istart += self.slide
        istop += self.slide

    # Trailing window, clipped to the end of the region.
    yield make_window(istart, self.stop)
def test_any_hits():
    """Check any_hits for an overlapping, a wrong-strand and a far query."""
    a = pybedtools.example_bedtool('a.bed')

    near = ['chr1', '900', '905', '.', '.', '-']
    far = ['chr1', '8000', '9000', '.', '.', '-']

    overlapping = pybedtools.create_interval_from_list(list(near))
    assert 1 == a.any_hits(overlapping)

    stranded = pybedtools.create_interval_from_list(list(near))
    assert 0 == a.any_hits(stranded, same_strand=True)

    distant = pybedtools.create_interval_from_list(list(far))
    assert 0 == a.any_hits(distant)
def values(self, chrom, start, end, strand, flatten=False):
    """
    Return per-position peak overlap over a region.

    Parameters
    ----------
    chrom : basestring
        (eg. chr1)
    start : int
        0-based start (first position in chromosome is 0)
    end : int
        1-based end (last position is not included)
    strand : str
        either '+' or '-'
    flatten : bool
        When False (default), overlaps of all same-strand peaks are
        summed. When True nothing is accumulated: the code only prints a
        "not implemented" notice for each matching peak.

    Returns
    -------
    densities : pandas.Series
        One value per position of the region, initialised to 0. Returned
        untouched when the peak lookup raises RuntimeError or yields None.
    """
    region = pybedtools.create_interval_from_list(
        [chrom, str(start), str(end), '.', '0', strand])
    series = pd.Series(data=0, index=range(len(region)))

    # Get all overlapping values
    try:
        overlapped_peaks = self.peaks.entries(chrom, start, end, strand)
    except RuntimeError as e:
        message = (
            "weird entry (this can happen if the peak bb does not contain this chromosome, or if the region is invalid)"
            ": {}:{}-{}:{}").format(chrom, start, end, strand)
        print(message, e)
        return series

    if overlapped_peaks is None:
        return series

    for p in overlapped_peaks:
        bed_list = [chrom, str(p[0]), str(p[1])] + p[2].split('\t')
        if bed_list[5] != strand:
            continue  # peak on the other strand
        peak = pybedtools.create_interval_from_list(bed_list)
        if flatten:
            # TODO: implement flatten
            print('not implemented or important yet')
        else:
            series += intervals.get_overlap(peak, region)

    return series
def test_split_prox_dist_1():
    """
    A 10nt intron with distance 5 must produce exactly one proximal
    region covering the whole intron and no distal region.
    """
    print("Tests the core functionality of assigning proximal and distal "
          "intron spaces. Should not return any distal introns based on the "
          "specified distance.")
    length = 10
    midpoint = 5
    intron_end = str(length)

    intron_interval = pybedtools.create_interval_from_list(
        ['chr1', '0', intron_end, 'intron', '0', '+'])
    proxdist_dict = af.get_proxdist_from_intron(
        interval=intron_interval, distance=midpoint)

    assert 'prox' in proxdist_dict.keys()   # found a prox intron region.
    assert len(proxdist_dict['dist']) == 0  # found no dist intron
    assert len(proxdist_dict['prox']) == 1  # just found one prox intron

    expected_prox = pybedtools.create_interval_from_list(
        ['chr1', '0', intron_end, 'proxintron5', '0', '+'])
    assert proxdist_dict['prox'][0] == expected_prox
def _iter_bed_dict(bed, val_index=None):
    """
    Iterate through a bed dict, yielding one single-position BED6 interval
    per entry.

    ``bed`` maps ``(chrome, strand)`` to a dict of ``pos -> val``. The
    score column is ``_f2s(val)``, or ``_f2s(val[val_index])`` when
    ``val_index`` is given.
    """
    for (chrome, strand), by_pos in bed.items():
        for pos, val in by_pos.items():
            score = val if val_index is None else val[val_index]
            yield pybedtools.create_interval_from_list(
                [chrome, pos, pos + 1, '.', _f2s(score), strand]
            )
def test_interval_index():
    """
    Supplement to the more general test in
    test_cbedtools.IntervalTest.testGetItemNegative.
    """
    bed_fields = 'chr21 9719768 9721892 ALR/Alpha 1004 +'.split()
    gff_fields = [
        'chr1', 'ucb', 'gene', '465', '805', '.', '+', '.',
        'ID=thaliana_1_465_805;match=scaffold_801404.1;rname=thaliana_1_465_805'
    ]

    iv = pybedtools.create_interval_from_list(bed_fields)
    assert iv[-1] == '+'
    assert iv[2:-1] == ['9721892', 'ALR/Alpha', '1004']

    iv = pybedtools.create_interval_from_list(gff_fields)
    middle = iv[4:-3]
    # print() call form is valid on both Python 2 and 3, unlike the
    # original print statement.
    print(middle)
    assert middle == ['805', '.']
def build_transcript_data_gtf_as_structure(species, pre_mrna):
    """
    Build a BedTool of mRNA records from the compiled AS_STRUCTURE gff.

    species - prefix of the "<species>.AS.STRUCTURE.COMPILED.gff" data
              file (generated from the AS_STRUCTURE_gtf ipython notebook)
    pre_mrna - if true uses pre mRNA length instead of mRNA length as the
               effective_length attribute
    """
    length_key = 'premrna_length' if pre_mrna else 'mrna_length'
    gtf_file = pybedtools.BedTool(
        clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff"))

    bedtoolintervals = []
    for gene in gtf_file:
        effective_length = gene.attrs[length_key]

        attrs = "gene_id=%s;" % (gene.attrs['gene_id'])
        if "transcript_ids" in gene.attrs:
            attrs += "transcript_ids=%s;" % (gene.attrs['transcript_ids'])
        attrs += "effective_length=%s" % (str(effective_length))

        # NOTE(review): both start and stop are shifted by +1 -- confirm
        # the shift on stop is intended.
        fields = [
            gene['chrom'], "AS_STRUCTURE", "mRNA",
            str(gene.start + 1), str(gene.stop + 1),
            "0", gene['strand'], ".", attrs,
        ]
        # A list comprehension instead of map(): create_interval_from_list
        # needs a list, and map() returns an iterator on Python 3.
        bedtoolintervals.append(
            pybedtools.create_interval_from_list([str(f) for f in fields]))

    return pybedtools.BedTool(bedtoolintervals)
def test_plus_feature_minus_reads(self):
    """Minus-strand reads only: the plus-strand read must not be counted."""
    # the plus-strand read from 10-14 should not appear -- so all zeros
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '0', '20', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, read_strand='-', fragment_size=5)

    # Materialise once: on Python 3 zip() is an iterator, so comparing it
    # with a list is always False and printing it shows nothing useful.
    profile = list(zip(x, y))
    pp(profile)
    assert profile == \
        [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0),
         (5, 0), (6, 0), (7, 0), (8, 0), (9, 0),
         (10, 0), (11, 0), (12, 0), (13, 0), (14, 0),
         (15, 0), (16, 0), (17, 0), (18, 0), (19, 0)]
def test_fragmentsize(self):
    """With fragment_size=10 the covered stretch is positions 10-19."""
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '0', '25', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, fragment_size=10)

    # list(zip(...)) so the comparison is meaningful on Python 3, where
    # zip() returns an iterator that never equals a list.
    profile = list(zip(x, y))
    pp(profile)
    assert profile == \
        [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0),
         (5, 0), (6, 0), (7, 0), (8, 0), (9, 0),
         (10, 1), (11, 1), (12, 1), (13, 1), (14, 1),
         (15, 1), (16, 1), (17, 1), (18, 1), (19, 1),
         (20, 0), (21, 0), (22, 0), (23, 0), (24, 0)]
def test_identity_bins(self):
    """Binning with as many bins as bases must not change the profile."""
    # Same number of bins as bp in the feature
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '0', '20', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, fragment_size=10, bins=20)

    # np.array(zip(...)) wraps the zip iterator in a 0-d object array on
    # Python 3; materialise the pairs first.
    profile = list(zip(x, y))
    pp(profile)
    expected = [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0),
                (5, 0), (6, 0), (7, 0), (8, 0), (9, 0),
                (10, 1), (11, 1), (12, 1), (13, 1), (14, 1),
                (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
    assert np.allclose(np.array(profile), np.array(expected))
def test_shiftwidth_and_fragmentsize(self):
    """Shifted, extended reads on opposite strands overlap in the middle."""
    #
    # Reads on opposite strands shift oppositely...
    #
    #      |||||     original
    #
    #    +|||||      minus strand read, leftshift 1 and additional to left (3')
    #       |||||+   plus strand read, rightshift 1 and additional to right (3')
    #    111222111
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '60', '80', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, fragment_size=6, shift_width=1)

    # list(zip(...)): a bare zip object never equals a list on Python 3.
    profile = list(zip(x, y))
    pp(profile)
    assert profile == \
        [(60, 0), (61, 0), (62, 0), (63, 0), (64, 0),
         (65, 0), (66, 0), (67, 0), (68, 1), (69, 1),
         (70, 1), (71, 2), (72, 2), (73, 2), (74, 1),
         (75, 1), (76, 1), (77, 0), (78, 0), (79, 0)]
def test_plus_feature(self):
    """Plus-strand feature: coverage of 1 over positions 10-14 only."""
    # first make one where fragments are exactly as long as reads
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '0', '20', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, fragment_size=5)

    # list(zip(...)): a bare zip object never equals a list on Python 3.
    profile = list(zip(x, y))
    pp(profile)
    assert profile == \
        [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0),
         (5, 0), (6, 0), (7, 0), (8, 0), (9, 0),
         (10, 1), (11, 1), (12, 1), (13, 1), (14, 1),
         (15, 0), (16, 0), (17, 0), (18, 0), (19, 0)]
def test_minus_feature(self):
    """Minus-strand feature: the covered stretch moves to positions 5-9."""
    # minus strand flips the profile
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '0', '20', '.', '.', '-'])
    xm, ym = self.m.local_coverage(feature, fragment_size=5)

    # list(zip(...)): a bare zip object never equals a list on Python 3.
    profile = list(zip(xm, ym))
    pp(profile)
    assert profile == \
        [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0),
         (5, 1), (6, 1), (7, 1), (8, 1), (9, 1),
         (10, 0), (11, 0), (12, 0), (13, 0), (14, 0),
         (15, 0), (16, 0), (17, 0), (18, 0), (19, 0)]
def redefine_regions(df):
    """
    Turn overlapping regions into distinct non-overlapping regions.

    Every start and end in ``df`` becomes a breakpoint; one interval is
    built between each pair of consecutive sorted breakpoints. All
    intervals take the chrom and strand of the last row of ``df``.

    :param df: pandas.DataFrame()
        The to_dataframe() result of a bedtools cluster call
    :return BedTool(non-overlapping interval): pybedtools.BedTool()
        The BedTool of non-overlapping intervals.
    """
    breakpoints = []
    for _index, row in df.iterrows():
        chrom = row['chrom']
        strand = row['strand']
        breakpoints.extend((row['start'], row['end']))

    ordered = sorted(set(breakpoints))
    pieces = [
        bt.create_interval_from_list(
            [chrom, str(left), str(right), 'name', '0', strand])
        for left, right in zip(ordered, ordered[1:])
    ]
    return bt.BedTool(pieces)
def output_bed_coords_from_fasta(fasta_fname, bed_fname):
    """
    Output event coordinates from a FASTA file into a BED format.

    Each FASTA entry name must contain ';'; the second ';'-separated field
    is parsed as GFF coordinates. (The original docstring described the
    name as ``>part_id:coords:entry_type``.)

    Returns the total length of all FASTA sequences read.

    Raises Exception on an entry name without ';'.
    """
    # print()/raise call forms are valid on both Python 2 and 3; the
    # original statement forms were Python-2-only syntax.
    print("Converting FASTA %s to BED %s" % (fasta_fname, bed_fname))
    total_len = 0
    with open(bed_fname, "w") as bed_out:
        for fasta_name, fasta_seq in fastx_utils.get_fastx_entries(fasta_fname):
            # Assume FASTA entry coordinates are in GFF format.
            # Convert them to BED
            if ";" not in fasta_name:
                raise Exception(
                    "Malformed FASTA entry name: %s" % (fasta_name))
            gff_coords = fasta_name.split(";")[1]
            chrom, start, end, strand = parse_gff_coords(gff_coords)
            # Convert start to BED by subtracting one
            start = start - 1
            # Real list rather than map(): map() is lazy on Python 3.
            fields = [str(value) for value in
                      (chrom, start, end, gff_coords, "1", strand)]
            bed_entry = pybedtools.create_interval_from_list(fields)
            bed_out.write("%s" % (str(bed_entry)))
            # Accumulate total length of FASTA seqs
            total_len += len(fasta_seq)
    return total_len
def get_bedtool_iter():
    """
    Yield one GFF 'gene' interval per row of the enclosing-scope ``table``.

    txStart is shifted by +1. Null name2 / refseq / gene-symbol values are
    replaced by "NA" in the attributes column.
    """
    def or_na(value):
        return "NA" if pandas.isnull(value) else value

    for gene_num, gene_entry in table.iterrows():
        chrom = gene_entry["chrom"]
        start = int(gene_entry["txStart"]) + 1
        end = int(gene_entry["txEnd"])
        strand = gene_entry["strand"]

        # Annotation fields
        name2 = or_na(gene_entry["name2"])
        refseq_id = or_na(gene_entry["refseq"])
        gene_symbol = or_na(gene_entry[gene_symbol_col])
        attributes = "ID=%s;ensg_id=%s;refseq_id=%s;gsymbol=%s;" % (
            name2, name2, refseq_id, gene_symbol)

        # Convert table to BedTool
        yield pybedtools.create_interval_from_list(
            [chrom, "genes_table", "gene", str(start), str(end),
             ".", strand, ".", attributes])
def genotype_intervals(intervals_file=None, bam=None, workdir=None, window=GT_WINDOW, isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD,
                       normal_frac_threshold=GT_NORMAL_FRAC):
    """
    Genotype every interval of a BED file against a BAM and write the
    result to ``<workdir>/genotyped.bed``.

    Each input interval is copied with the genotype returned by
    ``genotype_interval`` appended as an extra field. Insert-size bounds
    passed down are ``isize_mean`` -/+ 3 * ``isize_sd`` (lower bound
    clamped at 0).

    Returns the path of the genotyped BED file.

    Any exception is logged, its traceback printed, and then re-raised.
    """
    func_logger = logging.getLogger(
        "%s-%s" % (genotype_intervals.__name__, multiprocessing.current_process()))

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    pybedtools.set_tempdir(workdir)

    genotyped_intervals = []
    start_time = time.time()

    isize_min = max(0, isize_mean - 3 * isize_sd)
    isize_max = isize_mean + 3 * isize_sd

    bam_handle = None
    try:
        bam_handle = pysam.Samfile(bam, "rb")
        for interval in pybedtools.BedTool(intervals_file):
            chrom, start, end, sv_type, svlen = parse_interval(interval)
            genotype = genotype_interval(chrom, start, end, sv_type, svlen, bam_handle,
                                         isize_min, isize_max, window, normal_frac_threshold)
            fields = interval.fields + [genotype]
            genotyped_intervals.append(pybedtools.create_interval_from_list(fields))
        bedtool = pybedtools.BedTool(genotyped_intervals).moveto(
            os.path.join(workdir, "genotyped.bed"))
    except Exception:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise re-raises the same exception with its original traceback.
        raise
    finally:
        # The original never closed the BAM handle.
        if bam_handle is not None:
            bam_handle.close()

    func_logger.info("Genotyped %d intervals in %g minutes" % (
        len(genotyped_intervals), (time.time() - start_time) / 60.0))

    return bedtool.fn
def test_minus_feature(self):
    """Coverage over a minus-strand feature is flipped: ones at 5-9."""
    # minus strand flips the profile
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '0', '20', '.', '.', '-'])
    xm, ym = self.m.local_coverage(feature, fragment_size=5)

    # Build the pair list once; zip() is a one-shot iterator on Python 3
    # and would never compare equal to a list.
    observed = list(zip(xm, ym))
    pp(observed)
    assert observed == \
        [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0),
         (5, 1), (6, 1), (7, 1), (8, 1), (9, 1),
         (10, 0), (11, 0), (12, 0), (13, 0), (14, 0),
         (15, 0), (16, 0), (17, 0), (18, 0), (19, 0)]
def build_transcript_data_gtf_as_structure(species, pre_mrna):
    """
    Build a BedTool of "mRNA" GFF records from the compiled AS structure file.

    The input is ``<species>.AS.STRUCTURE.COMPILED.gff``, located through
    ``clipper.data_file``. Each output record carries ``gene_id``, optionally
    ``transcript_ids``, and ``effective_length`` in its attribute column.

    species - prefix of the compiled AS structure gff file name
    pre_mrna - if true uses pre mRNA length instead of mRNA length
    """
    gff_path = clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff")
    results = []

    for gene in pybedtools.BedTool(gff_path):
        effective_length = gene.attrs['premrna_length'] if pre_mrna else gene.attrs['mrna_length']

        attrs = "gene_id=%s;" % (gene.attrs['gene_id'])
        if "transcript_ids" in gene.attrs:
            attrs += "transcript_ids=%s;" % (gene.attrs['transcript_ids'])
        attrs += "effective_length=%s" % (str(effective_length))

        # NOTE(review): both start and stop are shifted by +1 here; confirm
        # this matches the coordinate convention of the compiled GFF.
        fields = [gene['chrom'],
                  "AS_STRUCTURE",
                  "mRNA",
                  str(gene.start + 1),
                  str(gene.stop + 1),
                  "0",
                  gene['strand'],
                  ".",
                  attrs]

        # Build a real list: on Python 3 ``map`` returns a lazy iterator,
        # while create_interval_from_list expects a list of strings.
        results.append(pybedtools.create_interval_from_list([str(field) for field in fields]))

    return pybedtools.BedTool(results)
def __getitem__(self, key):
    """
    Yield the intervals of the underlying file that overlap ``key``.

    ``key`` must expose ``chrom``, ``start`` and ``end``. Every record
    returned by ``self.fileobj.get`` is converted to a pybedtools interval
    with ``file_type`` set to 'bed'. Nothing is yielded when the lookup
    returns ``None``.

    Raises NotImplementedError when the lookup fails with
    StrandFormatError; the message explains how to convert the file to BAM.
    """
    chrom = key.chrom
    start = key.start
    stop = key.end
    try:
        bx_intervals = self.fileobj.get(chrom, start, stop)
    except StrandFormatError:
        raise NotImplementedError(dedent(
            """
            It appears you have a version of bx-python where bigBed files
            are temporarily unsupported due to recent changes in the
            bx-python dependency. In the meantime, please convert bigBed to
            BAM like this:

                bigBedToBed {0} tmp.bed
                bedtools bedtobam -i tmp.bed > {0}.bam

            and create a genomic signal object using this {0}.bam file.
            """.format(self.fn)))
    if bx_intervals is None:
        # A plain return ends the generator. Raising StopIteration inside a
        # generator body is turned into RuntimeError by PEP 479 (Python 3.7+).
        return
    for i in bx_intervals:
        interval = pybedtools.create_interval_from_list(i.fields)
        interval.file_type = 'bed'
        yield interval
def primer_to_gff(name, primer, tag, seq_name, seq_start, strand, **kwargs):
    """Create a gff feature from partially parsed primer3 results.

    :param name: value stored in the ``ID`` attribute
    :param primer: dict of primer3 values; must contain ``'position'``
        (formatted ``"pos,length"``) and ``'PENALTY'`` (used as the score
        column). All other entries are copied into the attribute column
        with ``;`` escaped as ``%3B``.
    :param tag: value of the gff feature-type column
    :param seq_name: value of the gff sequence-name column
    :param seq_start: offset added to the primer position
    :param strand: ``'-'`` selects the right-primer coordinate rule; any
        other value uses the left-primer rule
    :param kwargs: extra attributes, stored as strings
    :return: the interval built by ``pybedtools.create_interval_from_list``
    """
    # ``length`` rather than ``len`` so the builtin is not shadowed.
    pos, length = map(int, primer['position'].split(','))

    # transfer the calculated values to attributes
    # skip the fields used elsewhere in gff
    at = pybedtools.Attributes(' ')
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for k, v in primer.items():
        if k == 'position':
            continue
        at[k] = v.replace(';', '%3B')

    at['ID'] = name

    # pass all optional params to attributes
    at.update({k: str(v) for k, v in kwargs.items()})

    # primer3 provides the coordinates of right primer with the
    # pos pointing to the last base
    if strand == '-':
        start = seq_start + pos - length + 2
        end = seq_start + pos + 1
    else:
        start = seq_start + pos + 1
        end = seq_start + pos + length

    gflist = [seq_name, 'design-primers', tag,
              str(start), str(end),
              primer['PENALTY'], strand, '.', str(at)]
    return pybedtools.create_interval_from_list(gflist)
def truncator(feature):
    """
    Reduce a feature of any format to a three-column BED interval.

    Only ``chrom``, ``start`` and ``stop`` of the input feature are kept.
    """
    bed3_fields = [
        feature.chrom,
        str(feature.start),
        str(feature.stop),
    ]
    return pybedtools.create_interval_from_list(bed3_fields)
def create_bedtools(features, keys, by_transcript=False):
    """
    Build a BedTool with one six-column BED interval per identifier of each
    feature.

    The interval name is taken from the feature attribute named by
    ``keys['gene_id']``, or ``keys['transcript_id']`` when ``by_transcript``
    is true. A feature listing several identifiers produces one interval
    per identifier.

    :param features: list
        list of gffutils features (1-based) to convert to bedtool intervals
    :param keys: dict
        maps generic names ('gene_id', 'transcript_id') to the attribute
        names used by the GTF/GFF file at hand
    :param by_transcript: bool
        name intervals by transcript id instead of gene id
    :return: pybedtools.BedTool
    """
    intervals = []
    if by_transcript:
        key = 'transcript_id'
    else:
        key = 'gene_id'

    progress = trange(len(features), desc='creating bedtools.')
    for feature in features:
        identifiers = feature.attributes[keys[key]]
        for identifier in identifiers:
            # BED start is the 1-based feature start minus one.
            bed_fields = [
                feature.seqid,
                str(feature.start - 1),
                str(feature.end),
                identifier,
                '0',
                feature.strand,
            ]
            intervals.append(pybedtools.create_interval_from_list(bed_fields))
        progress.update(1)

    return pybedtools.BedTool(intervals)
def test_shiftwidth_of_1_plus_only(self):
    """With shift_width=1 the plus-strand read moves one base to the right."""
    # The plus-strand read should shift right by 1
    feature = pybedtools.create_interval_from_list(['chr2L', '0', '20', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, fragment_size=5, shift_width=1)
    # Materialize the pairs: on Python 3 ``zip`` returns a lazy iterator,
    # which never compares equal to a list.
    observed = list(zip(x, y))
    pp(observed)
    assert observed == \
        [(0, 0),
         (1, 0),
         (2, 0),
         (3, 0),
         (4, 0),
         (5, 0),
         (6, 0),
         (7, 0),
         (8, 0),
         (9, 0),
         (10, 0),
         (11, 1),
         (12, 1),
         (13, 1),
         (14, 1),
         (15, 1),
         (16, 0),
         (17, 0),
         (18, 0),
         (19, 0)]
def merge_peaks_count(trimmed_windows, read_pos_weights):
    """
    Merge overlapping windows and sum the read weights falling in each.

    :param trimmed_windows: iterable of ``(start, end)`` pairs; each is
        written as a BED interval ``[start - 1, end)`` on a placeholder
        chromosome before merging
    :param read_pos_weights: indexable sequence of ``(position, weight)``
        pairs. NOTE(review): the bisect lookups assume it is ordered by
        position -- confirm with the caller.
    :return: list of ``(peak_start, peak_end, weight_sum)`` tuples, one per
        merged interval
    """
    # One BED record per window, all on the same placeholder chromosome.
    window_intervals = [
        pybedtools.create_interval_from_list(['chrFAKE', str(wstart-1), str(wend)])
        for wstart, wend in trimmed_windows
    ]
    merged = pybedtools.BedTool(window_intervals).merge(stream=True)

    read_positions = [pos for (pos, w) in read_pos_weights]

    peaks = []
    for merged_interval in merged.features():
        # Undo the start shift applied when the BED records were built.
        pstart = merged_interval.start+1
        pend = merged_interval.end
        first_read = bisect_left(read_positions, pstart)
        past_last_read = bisect_right(read_positions, pend)
        weight_sum = sum(read_pos_weights[i][1]
                         for i in range(first_read, past_last_read))
        peaks.append((pstart, pend, weight_sum))
    return peaks
def test_shift_of_0(self):
    """Check local coverage over chr2L:60-80 with fragment_size=5 and shift_width=0."""
    feature = pybedtools.create_interval_from_list(['chr2L', '60', '80', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, fragment_size=5, shift_width=0)
    # Materialize the pairs: on Python 3 ``zip`` returns a lazy iterator,
    # which never compares equal to a list.
    observed = list(zip(x, y))
    pp(observed)
    assert observed == \
        [(60, 0),
         (61, 0),
         (62, 0),
         (63, 0),
         (64, 0),
         (65, 0),
         (66, 0),
         (67, 0),
         (68, 0),
         (69, 0),
         (70, 2),
         (71, 2),
         (72, 2),
         (73, 2),
         (74, 2),
         (75, 0),
         (76, 0),
         (77, 0),
         (78, 0),
         (79, 0)]
def test_fragmentsize(self):
    """Check local coverage over chr2L:0-25 with fragment_size=10."""
    feature = pybedtools.create_interval_from_list(['chr2L', '0', '25', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, fragment_size=10)
    # Materialize the pairs: on Python 3 ``zip`` returns a lazy iterator,
    # which never compares equal to a list.
    observed = list(zip(x, y))
    pp(observed)
    assert observed == \
        [(0, 0),
         (1, 0),
         (2, 0),
         (3, 0),
         (4, 0),
         (5, 0),
         (6, 0),
         (7, 0),
         (8, 0),
         (9, 0),
         (10, 1),
         (11, 1),
         (12, 1),
         (13, 1),
         (14, 1),
         (15, 1),
         (16, 1),
         (17, 1),
         (18, 1),
         (19, 1),
         (20, 0),
         (21, 0),
         (22, 0),
         (23, 0),
         (24, 0)]
def test_RNA_position_fail(self):
    """
    Tries inputs meant to break RNA_position and checks that each one
    yields (None, None)
    """
    location_dict = {"ENSMUSG1": {"strand": "-",
                                  "regions": [(100, 150),
                                              (25, 50),
                                              ]
                                  }
                     }

    # Both BED lines are checked against the same location_dict.
    failing_bed_lines = (
        "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 10 10",
        "chr1 50 60 ENSMUSG1_1_83;ENSMUSG1_6_83 0 - 175 175",
    )
    for bed_line in failing_bed_lines:
        tool = pybedtools.create_interval_from_list(bed_line.split())
        self.assertEqual(RNA_position(tool, location_dict), (None, None))
def test_plus_feature_minus_reads(self):
    """Restricting to minus-strand reads on a plus feature gives zero coverage."""
    # the plus-strand read from 10-14 should not appear -- so all zeros
    feature = pybedtools.create_interval_from_list(
        ['chr2L', '0', '20', '.', '.', '+'])
    x, y = self.m.local_coverage(feature, read_strand='-', fragment_size=5)
    # Materialize the pairs: on Python 3 ``zip`` returns a lazy iterator,
    # which never compares equal to a list.
    observed = list(zip(x, y))
    pp(observed)
    assert observed == \
        [(0, 0),
         (1, 0),
         (2, 0),
         (3, 0),
         (4, 0),
         (5, 0),
         (6, 0),
         (7, 0),
         (8, 0),
         (9, 0),
         (10, 0),
         (11, 0),
         (12, 0),
         (13, 0),
         (14, 0),
         (15, 0),
         (16, 0),
         (17, 0),
         (18, 0),
         (19, 0)]