def readGtfFile(filename, filter_feature=None):
    """ Read a GTF/GFF file.

    filename: name of file
    filter_feature: name of feature to be selected, all others ignored; None means anything

    Returns a dictionary that maps each sequence name (first column) to an
    interval tree holding one GtfEntry per accepted row.

    Blank lines, comment lines ('#') and 'browser'/'track' lines are skipped.
    The first row that cannot be parsed is tolerated as a header row; any
    further unparsable row raises RuntimeError naming the offending row.
    """
    acceptHeaderRows = 1  # number of unparsable rows still tolerated as headers
    chroms = dict()
    # the context manager closes the file even when a row raises
    with open(filename) as f:
        for row, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # ignore empty lines (split('\t') would yield [''] for them)
            words = stripped.split('\t')
            if words[0].strip().startswith(('#', 'browser', 'track')):
                continue  # comment or browser/track directive
            try:
                seqname = words[0]
                source = words[1]
                feature = words[2]
                if filter_feature and filter_feature != feature:
                    continue
                start = int(words[3])
                end = int(words[4])
                # non-numeric placeholders (e.g. '.') leave the optional fields unset
                score = int(words[5]) if words[5].isnumeric() else None
                strand = words[6] if words[6] in ('+', '-') else '.'
                frame = int(words[7]) if words[7].isdigit() else None
                group = words[8] if len(words) > 8 else None
                entry = GtfEntry(seqname, start, end, feature, score, source, strand, frame, group)
                # check if the chromosome has been seen before
                tree = chroms.get(seqname)
                if tree is None:
                    tree = ival.IntervalTree()
                    chroms[seqname] = tree
                # put the entry in the interval tree for the appropriate chromosome
                iv = ival.Interval(entry.start, entry.end)
                tree.put(iv, entry)
            except (ValueError, IndexError, RuntimeError) as e:
                # int() raises ValueError and a short row raises IndexError;
                # both mean the row is not a well-formed record
                if not acceptHeaderRows:
                    raise RuntimeError('Error in GTF/GFF file at row %d (%s)' % (row, e)) from e
                acceptHeaderRows -= 1  # count down the number of header rows that can occur
    return chroms
def __init__(self, entries, filter_feature=None):
    """
    Create a GtfFile instance, either from a file or from entries in memory.
    :param entries: a filename (str) that is parsed with readGtfFile, or an iterable of entries
    :param filter_feature: handed to readGtfFile when a filename is given; not used otherwise
    """
    if isinstance(entries, str):
        # a string is taken to be the name of a file
        self.chroms = readGtfFile(entries, filter_feature)
        return
    self.chroms = dict()
    for record in entries:
        name = record.chrom
        # reuse the tree of a chromosome seen before, otherwise register a new one
        chrom_tree = self.chroms.get(name)
        if not chrom_tree:
            chrom_tree = self.chroms[name] = ival.IntervalTree()
        # store the entry under the interval it spans
        chrom_tree.put(ival.Interval(record.start, record.end), record)
def __init__(self, entries, format='Limited'):
    """
    Create a BedFile instance, either from a file or from entries in memory.
    :param entries: a filename (str) that is parsed with readBedFile, or an iterable of entries
    :param format: the format of the BED file; stored on the instance and handed to readBedFile
    """
    self.format = format
    if isinstance(entries, str):
        # a string is taken to be the name of a file
        self.chroms = readBedFile(entries, format)
        return
    self.chroms = dict()
    for record in entries:
        name = record.chrom
        # reuse the tree of a chromosome seen before, otherwise register a new one
        chrom_tree = self.chroms.get(name)
        if not chrom_tree:
            chrom_tree = self.chroms[name] = ival.IntervalTree()
        # store the entry under the interval it spans
        chrom_tree.put(ival.Interval(record.chromStart, record.chromEnd), record)
def readBedGraphFile(filename,chr='chr1'):
    """ Read a Bedgraph file - suitable for large files.

    Only rows whose first column equals chr are loaded, and reading stops at
    the first row naming a different chromosome once chr has been seen.

    filename: name of file
    chr: name of the single chromosome to load

    Returns a dictionary that maps chr to an interval tree with one BedEntry
    (carrying a 'score' option) per loaded row; the dictionary is empty when
    no row for chr was loaded.

    The first row for chr that cannot be parsed is tolerated as a header row;
    any further unparsable row raises RuntimeError naming the offending row.
    """
    acceptHeaderRows = 1  # number of unparsable rows still tolerated as headers
    chroms = dict()
    found = False
    # the context manager closes the file even when a row raises
    with open(filename) as f:
        for row, line in enumerate(f, start=1):
            words = line.strip().split()
            if not words:
                continue  # ignore empty lines
            chrom = words[0]
            if chrom != chr:
                if found:  # no more entries
                    break
                continue
            found = True  # found correct chromosome
            try:
                chromStart = int(words[1])
                chromEnd = int(words[2])
                score = float(words[3])
                entry = BedEntry(chrom, chromStart, chromEnd)
                entry.addOption(score=score)
                # the tree is created on demand so that the very first row of
                # the chromosome is stored as well, not just the later ones
                tree = chroms.get(chrom)
                if tree is None:
                    tree = ival.IntervalTree()
                    chroms[chrom] = tree
                # put the entry in the interval tree for the appropriate chromosome
                iv = ival.Interval(entry.chromStart, entry.chromEnd)
                tree.put(iv, entry)
            except (ValueError, IndexError, RuntimeError) as e:
                # int()/float() raise ValueError and a short row raises IndexError
                if not acceptHeaderRows:
                    raise RuntimeError('Error in BED file at row %d (%s)' % (row, e)) from e
                acceptHeaderRows -= 1  # count down the number of header rows that can occur
    return chroms
def addBlock(self, relative_start, size):
    """
    Register a block on this entry.
    :param relative_start: offset of the block start, counted from self.chromStart
    :param size: length of the block
    The block tree in self.blocks is created on first use.
    """
    if not self.blocks:
        self.blocks = ival.IntervalTree()
    # translate the relative offset into absolute coordinates
    block_start = self.chromStart + relative_start
    block_end = block_start + size
    self.blocks.put(ival.Interval(block_start, block_end))
def readBedFile(filename, format='Limited'):
    """ Read a BED file.

    filename: name of file
    format: specifies the format of the file (matched case-insensitively by prefix),
        "Limited", e.g.
            chr22 1000 5000
            chr22 2000 6000
        "Optional", e.g.
            track name=pairedReads description="Clone Paired Reads" useScore=1
            chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
            chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
            ...
            (also handles the Limited + score, and BED6 format)
        "Peaks", e.g.
            chr1 569780 569930 . 0 . 19 6.07811 -1 -1
            chr1 713300 713450 . 0 . 54 49.1167 -1 -1
        "Strand", e.g.
            chr4 185772359 185772424 -
            chr18 20513381 20513401 +
            also supports a 5th label field
            chr5 20611949 20611949 + ENSG00000251629_20611949
            chr3 42187863 42187863 - ENSG00000234562_42187863
        "Summit", e.g.
            # d = 130
            chr start end length summit tags -10*log10(pvalue) fold_enrichment FDR(%)
            chr1 8250 8671 422 286 46 145.84 11.68 0.51
            chr1 36382 36984 603 405 46 315.23 27.05 0.24
        "CCAT", e.g.
            chr8 94747805 94747070 94749250 525 3 21.519196 0.002000
            chr17 55277895 55277070 55279280 560 18 21.283333 0.002000
        "Cropped", e.g.
            chr1 851602 10
            chr1 921184 18
            chr1 931838 9

    Returns a dictionary that maps each chromosome name to an interval tree
    holding one BedEntry per accepted row.

    Blank lines, comment lines ('#') and 'browser'/'track' lines are skipped.
    The first row that cannot be parsed is tolerated as a header row; any
    further unparsable row raises RuntimeError naming the offending row.
    """
    fmt = format.lower()  # normalised once instead of once per test per row
    acceptHeaderRows = 1  # number of unparsable rows still tolerated as headers
    chroms = dict()
    # the context manager closes the file even when a row raises
    with open(filename) as f:
        for row, line in enumerate(f, start=1):
            words = line.strip().split()
            if not words:
                continue  # ignore empty lines
            if words[0].startswith(('#', 'browser', 'track')):
                continue  # comment or browser/track directive
            try:
                chrom = words[0]
                if fmt.startswith('ccat'):
                    # CCAT rows carry the region in columns 3 and 4
                    chromStart = int(words[2])
                    chromEnd = int(words[3])
                else:  # all other standard BED formats
                    chromStart = int(words[1])
                    chromEnd = int(words[2])
                entry = BedEntry(chrom, chromStart, chromEnd)
                if fmt.startswith(('opt', 'bed12')):
                    # take as many of the optional columns as the row provides
                    if len(words) >= 12:
                        entry.addOption(name=words[3], score=float(words[4]), strand=words[5],
                                        thickStart=int(words[6]), thickEnd=int(words[7]),
                                        itemRgb=words[8], blockCount=int(words[9]),
                                        blockSizes=words[10], blockStarts=words[11])
                    elif len(words) >= 9:
                        entry.addOption(name=words[3], score=float(words[4]), strand=words[5],
                                        thickStart=int(words[6]), thickEnd=int(words[7]),
                                        itemRgb=words[8])
                    elif len(words) >= 6:
                        entry.addOption(name=words[3], score=float(words[4]), strand=words[5])
                    elif len(words) >= 5:
                        entry.addOption(name=words[3], score=float(words[4]))
                    elif len(words) >= 4:
                        entry.addOption(name=words[3])
                    else:
                        # NOTE(review): only reached with fewer than four columns, where
                        # words[3] does not exist; such rows end up in the handler below
                        entry.addOption(name='.', score=int(words[3]), strand='.')
                elif fmt.startswith('bed6'):
                    entry.addOption(name=words[3], score=float(words[4]), strand=words[5])
                elif fmt.startswith('strand'):
                    if len(words) >= 4:  # properly formatted
                        entry.addOption(strand=words[3])
                    if len(words) >= 5:
                        entry.addOption(name=words[4])
                elif fmt.startswith('peak'):
                    if len(words) >= 10:  # narrowpeaks
                        entry.addOption(name=words[3], score=int(words[4]), strand=words[5],
                                        signalValue=float(words[6]), pValue=float(words[7]),
                                        qValue=float(words[8]), peak=int(words[9]))
                    else:  # broadpeaks
                        entry.addOption(name=words[3], score=int(words[4]), strand=words[5],
                                        signalValue=float(words[6]), pValue=float(words[7]),
                                        qValue=float(words[8]))
                elif fmt.startswith('summit'):
                    if len(words) >= 9:
                        entry.addOption(summit=int(words[4]), tags=int(words[5]),
                                        pValue=float(words[6]), fold=float(words[7]),
                                        fdr=float(words[8]))
                    else:
                        entry.addOption(summit=int(words[4]), tags=int(words[5]),
                                        pValue=float(words[6]), fold=float(words[7]))
                elif fmt.startswith('ccat'):
                    # the summit is stored relative to the start of the region
                    entry.addOption(summit=int(words[1]) - entry.chromStart, tags=int(words[4]),
                                    bg=int(words[5]), zscore=float(words[6]),
                                    fdr=float(words[7]), name='.', score=int(words[4]),
                                    strand='.')
                elif fmt.startswith('crop'):
                    # the third column is a score, so the entry covers a single position
                    entry.addOption(score=int(words[2]), name='.', strand='.')
                    entry.chromEnd = entry.chromStart + 1
                # check if the chromosome has been seen before
                tree = chroms.get(chrom)
                if tree is None:
                    tree = ival.IntervalTree()
                    chroms[chrom] = tree
                # put the entry in the interval tree for the appropriate chromosome
                iv = ival.Interval(entry.chromStart, entry.chromEnd)
                tree.put(iv, entry)
            except (ValueError, IndexError, RuntimeError) as e:
                # int()/float() raise ValueError and a short row raises IndexError;
                # both mean the row is not a well-formed record
                if not acceptHeaderRows:
                    raise RuntimeError('Error in BED file at row %d (%s)' % (row, e)) from e
                acceptHeaderRows -= 1  # count down the number of header rows that can occur
    return chroms
print(next(g)) cnt = 0 collect = [] for entry in bf: cnt += 1 print(str(cnt) + '\t' + str(entry)) collect.append(entry) if cnt == 7: for b in entry: print('\t', b) if cnt == 10: break writeBedFile(collect, '/Users/mikael/Desktop/test.bed') bf2 = BedFile('/Users/mikael/Desktop/test.bed', 'opt') q = ival.Interval(3805000, 3806000) t2 = ival.IntervalTree() t2.put(q, "blah") for entry in bf2: if entry.isBlockOverlap(q): print('Found:', entry) tree = entry.getBlocks() t2.putAll(tree) for t in t2: print(t) entry1 = BedEntry('chrX', 3858266, 3858530) print(entry1 in bf) entry2 = BedEntry('chrX', 10047550, 10067694) for x in bf.getOverlap(entry2): print(x) entry3 = BedEntry('chr9', 102699903, 102700167)