示例#1
0
def readGtfFile(filename, filter_feature=None):
    """ Read a GTF/GFF file into one interval tree per sequence name.

    filename: name of file
    filter_feature: name of feature to be selected, all others ignored;
        None (or any other falsy value) means anything

    Returns a dict that maps each sequence name (first column) to an
    ival.IntervalTree; every accepted row is stored in its tree as a
    GtfEntry under ival.Interval(entry.start, entry.end).

    Blank lines, comment lines ('#') and 'browser'/'track' lines are skipped.
    The first row that cannot be parsed is tolerated as a header row; any
    further unparsable row raises RuntimeError naming the row number.
    """
    skip_prefixes = ('#', 'browser', 'track')
    acceptHeaderRows = 1  # number of unparsable (header) rows still tolerated
    chroms = dict()
    # the context manager closes the file even when a row raises
    with open(filename) as f:
        for row, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # ignore empty lines
            words = stripped.split('\t')
            if words[0].strip().startswith(skip_prefixes):
                continue  # comment, browser or track line
            try:
                seqname = words[0]
                source = words[1]
                feature = words[2]
                if filter_feature and filter_feature != feature:
                    continue
                start = int(words[3])
                end = int(words[4])
                # score and frame stay None unless the column is a plain integer
                score = int(words[5]) if words[5].isnumeric() else None
                strand = words[6] if words[6] in ('+', '-') else '.'
                frame = int(words[7]) if words[7].isdigit() else None
                group = words[8] if len(words) > 8 else None
                entry = GtfEntry(seqname, start, end, feature, score, source,
                                 strand, frame, group)
                # check if the chromosome has been seen before
                tree = chroms.get(seqname)
                if tree is None:
                    tree = ival.IntervalTree()
                    chroms[seqname] = tree
                # put the entry in the interval tree for the appropriate chromosome
                iv = ival.Interval(entry.start, entry.end)
                tree.put(iv, entry)
            except (RuntimeError, ValueError, IndexError) as e:
                # int() raises ValueError and a short row raises IndexError;
                # both mark a row that is not a regular data row
                if not acceptHeaderRows:
                    raise RuntimeError('Error in GTF/GFF file at row %d (%s)' %
                                       (row, e)) from e
                acceptHeaderRows -= 1  # count down the number of header rows that can occur
    return chroms
示例#2
0
 def __contains__(self, item):
     """Report whether the interval of a GtfEntry is held by this collection.

     :param item: the object to look for; anything that is not a GtfEntry
         is never contained
     :return: True if ival.Interval(item.start, item.end) is in the tree
         stored for item.chrom, otherwise False
     """
     if not isinstance(item, GtfEntry):
         return False
     tree = self.chroms.get(item.chrom)
     if tree is None:  # no entries recorded for this chromosome
         return False
     return ival.Interval(item.start, item.end) in tree
示例#3
0
 def getClosest(self, item):
     """Look up the values stored closest to the interval of a GtfEntry.

     :param item: a GtfEntry; any other type yields None
     :return: the ``values`` attribute of the node returned by
         ``tree.closest`` for ival.Interval(item.start, item.end), or None
         when item.chrom has no tree or no node is returned
     """
     if not isinstance(item, GtfEntry):
         return None
     tree = self.chroms.get(item.chrom)
     if tree is None:  # no entries recorded for this chromosome
         return None
     node = tree.closest(ival.Interval(item.start, item.end))
     if node is None:
         return None
     return node.values
示例#4
0
 def getOverlap(self, item):
     """Collect the values of everything ``tree.isectall`` reports for a BedEntry.

     :param item: a BedEntry; any other type yields None
     :return: a list with the ``values`` of every result returned by
         ``tree.isectall`` for ival.Interval(item.chromStart, item.chromEnd),
         or None when item.chrom has no tree
     """
     if not isinstance(item, BedEntry):
         return None
     tree = self.chroms.get(item.chrom)
     if tree is None:  # no entries recorded for this chromosome
         return None
     iv = ival.Interval(item.chromStart, item.chromEnd)
     # flatten the per-result value collections into a single list
     return [value for hit in tree.isectall(iv) for value in hit.values]
示例#5
0
 def __init__(self, entries, filter_feature=None):
     """
     Create a GtfFile instance.
     :param entries: an iterable of entries or a filename
     :param filter_feature: passed on to readGtfFile when entries is a
         filename; not used when entries is an iterable
     """
     if isinstance(entries, str):
         # a string is taken to be the name of a file to read
         self.chroms = readGtfFile(entries, filter_feature)
         return
     self.chroms = dict()
     for gtf_entry in entries:
         # reuse the tree of a chromosome seen before, otherwise start a new one
         chrom_tree = self.chroms.get(gtf_entry.chrom)
         if not chrom_tree:
             chrom_tree = ival.IntervalTree()
             self.chroms[gtf_entry.chrom] = chrom_tree
         # store the entry under the interval it spans
         span = ival.Interval(gtf_entry.start, gtf_entry.end)
         chrom_tree.put(span, gtf_entry)
示例#6
0
 def __init__(self, entries, format='Limited'):
     """
     Create a BedFile instance.
     :param entries: an iterable of entries or a filename
     :param format: the format of the BED file; stored on the instance and
         passed on to readBedFile when entries is a filename
     """
     self.format = format
     if isinstance(entries, str):
         # a string is taken to be the name of a file to read
         self.chroms = readBedFile(entries, format)
         return
     self.chroms = dict()
     for bed_entry in entries:
         # reuse the tree of a chromosome seen before, otherwise start a new one
         chrom_tree = self.chroms.get(bed_entry.chrom)
         if not chrom_tree:
             chrom_tree = ival.IntervalTree()
             self.chroms[bed_entry.chrom] = chrom_tree
         # store the entry under the interval it spans
         span = ival.Interval(bed_entry.chromStart, bed_entry.chromEnd)
         chrom_tree.put(span, bed_entry)
示例#7
0
def readBedGraphFile(filename, chr='chr1'):
    """ Read the rows of one chromosome from a Bedgraph file - suitable for large files.

    filename: name of file
    chr: name of the chromosome (first column) whose rows are loaded

    Returns a dict that is empty when no row names the requested chromosome;
    otherwise it holds a single ival.IntervalTree under that name, with one
    BedEntry per row (score taken from the fourth column) stored under
    ival.Interval(chromStart, chromEnd).

    Reading stops at the first row naming another chromosome once the
    requested one has been seen, so only its first contiguous run of rows is
    loaded. The first unparsable row of the requested chromosome is tolerated
    as a header row; any further one raises RuntimeError naming the row.
    """
    acceptHeaderRows = 1  # number of unparsable (header) rows still tolerated
    chroms = dict()
    tree = None
    found = False
    # the context manager closes the file on break and on errors alike
    with open(filename) as f:
        for row, line in enumerate(f, start=1):
            words = line.split()
            if not words:
                continue  # ignore empty lines
            chrom = words[0]
            if chrom != chr:
                if found:
                    break  # moved past the requested chromosome: no more entries
                continue
            if not found:  # found correct chromosome
                tree = ival.IntervalTree()
                chroms[chrom] = tree
                found = True
            # the row that first names the chromosome is parsed like any other
            try:
                chromStart = int(words[1])
                chromEnd = int(words[2])
                entry = BedEntry(chrom, chromStart, chromEnd)
                entry.addOption(score=float(words[3]))
                # put the entry in the interval tree for the appropriate chromosome
                iv = ival.Interval(entry.chromStart, entry.chromEnd)
                tree.put(iv, entry)
            except (RuntimeError, ValueError, IndexError) as e:
                # int()/float() raise ValueError and a short row raises IndexError
                if not acceptHeaderRows:
                    raise RuntimeError('Error in BED file at row %d (%s)' % (row, e)) from e
                acceptHeaderRows -= 1  # count down the number of header rows that can occur
    return chroms
示例#8
0
 def getInterval(self):
     """Return an ival.Interval built from this entry's start and end."""
     lower, upper = self.start, self.end
     return ival.Interval(lower, upper)
示例#9
0
 def addBlock(self, relative_start, size):
     """Add a block to this entry's block tree.

     :param relative_start: offset of the block, added to self.chromStart
     :param size: length of the block, added to the block's start
     """
     if not self.blocks:
         # no usable block tree yet: start a fresh one
         self.blocks = ival.IntervalTree()
     block_start = self.chromStart + relative_start
     block_end = block_start + size
     self.blocks.put(ival.Interval(block_start, block_end))
示例#10
0
def readBedFile(filename, format='Limited'):
    """ Read a BED file into one interval tree per chromosome.

        filename: name of file
        format: specifies the format of the file (matched case-insensitively
        by prefix),
        "Limited", e.g.
            chr22 1000 5000
            chr22 2000 6000
        "Optional", e.g.
            track name=pairedReads description="Clone Paired Reads" useScore=1
            chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
            chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
            ...
            (also handles the Limited + score, and BED6 format)
        "Peaks", e.g.
            chr1    569780    569930    .    0    .    19    6.07811    -1    -1
            chr1    713300    713450    .    0    .    54    49.1167    -1    -1
        "Strand", e.g.
            chr4    185772359    185772424    -
            chr18    20513381    20513401    +
        also supports a 5th label field
            chr5    20611949        20611949        +       ENSG00000251629_20611949
            chr3    42187863        42187863        -       ENSG00000234562_42187863
        "Summit", e.g.
            # d = 130
            chr      start    end   length summit  tags -10*log10(pvalue)    fold_enrichment    FDR(%)
            chr1     8250     8671    422    286    46    145.84    11.68    0.51
            chr1    36382    36984    603    405    46    315.23    27.05    0.24
        "CCAT", e.g.
            chr8    94747805    94747070    94749250    525     3    21.519196    0.002000
            chr17   55277895    55277070    55279280    560    18    21.283333    0.002000
        "Cropped", e.g.
            chr1    851602    10
            chr1    921184    18
            chr1    931838    9

        Returns a dict that maps each chromosome name to an ival.IntervalTree;
        every accepted row is stored in its tree as a BedEntry under
        ival.Interval(entry.chromStart, entry.chromEnd).

        Blank lines, comment lines ('#') and 'browser'/'track' lines are
        skipped. The first row that cannot be parsed (such as the column
        header in the "Summit" example) is tolerated as a header row; any
        further unparsable row raises RuntimeError naming the row number.
    """
    fmt = format.lower()  # normalised once instead of once per comparison
    skip_prefixes = ('#', 'browser', 'track')
    acceptHeaderRows = 1  # number of unparsable (header) rows still tolerated
    chroms = dict()
    # the context manager closes the file even when a row raises
    with open(filename) as f:
        for row, line in enumerate(f, start=1):
            words = line.split()
            if not words:
                continue  # ignore empty lines
            if words[0].startswith(skip_prefixes):
                continue  # comment, browser or track line
            try:
                chrom = words[0]
                if fmt.startswith('ccat'):
                    chromStart = int(words[2])
                    chromEnd = int(words[3])
                else:  # all other standard BED formats
                    chromStart = int(words[1])
                    chromEnd = int(words[2])
                entry = BedEntry(chrom, chromStart, chromEnd)
                if fmt.startswith(('opt', 'bed12')):
                    if len(words) >= 12:
                        entry.addOption(name=words[3],
                                        score=float(words[4]),
                                        strand=words[5],
                                        thickStart=int(words[6]),
                                        thickEnd=int(words[7]),
                                        itemRgb=words[8],
                                        blockCount=int(words[9]),
                                        blockSizes=words[10],
                                        blockStarts=words[11])
                    elif len(words) >= 9:
                        entry.addOption(name=words[3],
                                        score=float(words[4]),
                                        strand=words[5],
                                        thickStart=int(words[6]),
                                        thickEnd=int(words[7]),
                                        itemRgb=words[8])
                    elif len(words) >= 6:
                        entry.addOption(name=words[3],
                                        score=float(words[4]),
                                        strand=words[5])
                    elif len(words) >= 5:
                        entry.addOption(name=words[3], score=float(words[4]))
                    elif len(words) >= 4:
                        entry.addOption(name=words[3])
                    # a row with only the three mandatory columns has no
                    # optional field to record (the former fallback read a
                    # fourth column that cannot exist at this point)
                elif fmt.startswith('bed6'):
                    entry.addOption(name=words[3],
                                    score=float(words[4]),
                                    strand=words[5])
                elif fmt.startswith('strand'):
                    if len(words) >= 4:  # properly formatted
                        entry.addOption(strand=words[3])
                    if len(words) >= 5:
                        entry.addOption(name=words[4])
                elif fmt.startswith('peak'):
                    if len(words) >= 10:  # narrowpeaks
                        entry.addOption(name=words[3],
                                        score=int(words[4]),
                                        strand=words[5],
                                        signalValue=float(words[6]),
                                        pValue=float(words[7]),
                                        qValue=float(words[8]),
                                        peak=int(words[9]))
                    else:  # broadpeaks
                        entry.addOption(name=words[3],
                                        score=int(words[4]),
                                        strand=words[5],
                                        signalValue=float(words[6]),
                                        pValue=float(words[7]),
                                        qValue=float(words[8]))
                elif fmt.startswith('summit'):
                    if len(words) >= 9:
                        entry.addOption(summit=int(words[4]),
                                        tags=int(words[5]),
                                        pValue=float(words[6]),
                                        fold=float(words[7]),
                                        fdr=float(words[8]))
                    else:
                        entry.addOption(summit=int(words[4]),
                                        tags=int(words[5]),
                                        pValue=float(words[6]),
                                        fold=float(words[7]))
                elif fmt.startswith('ccat'):
                    # the summit is stored relative to the start of the entry
                    entry.addOption(summit=int(words[1]) - entry.chromStart,
                                    tags=int(words[4]),
                                    bg=int(words[5]),
                                    zscore=float(words[6]),
                                    fdr=float(words[7]),
                                    name='.',
                                    score=int(words[4]),
                                    strand='.')
                elif fmt.startswith('crop'):
                    # the third column is a score here, so the end is reset
                    # to one position past the start
                    entry.addOption(score=int(words[2]), name='.', strand='.')
                    entry.chromEnd = entry.chromStart + 1
                # check if the chromosome has been seen before
                tree = chroms.get(chrom)
                if tree is None:
                    tree = ival.IntervalTree()
                    chroms[chrom] = tree
                # put the entry in the interval tree for the appropriate chromosome
                iv = ival.Interval(entry.chromStart, entry.chromEnd)
                tree.put(iv, entry)
            except (RuntimeError, ValueError, IndexError) as e:
                # int()/float() raise ValueError and a short row raises
                # IndexError; both mark a row that is not a regular data row
                if not acceptHeaderRows:
                    raise RuntimeError('Error in BED file at row %d (%s)' %
                                       (row, e)) from e
                acceptHeaderRows -= 1  # count down the number of header rows that can occur
    return chroms
示例#11
0
 def getInterval(self):
     """Return an ival.Interval built from this entry's chromStart and chromEnd."""
     lower, upper = self.chromStart, self.chromEnd
     return ival.Interval(lower, upper)
示例#12
0
    # NOTE(review): this excerpt starts mid-scope; `g` and `bf` are created
    # before the visible lines - presumably an iterator and a BedFile; confirm.
    # Show the next two items produced by g.
    print(next(g))
    print(next(g))
    # Print and collect the first ten entries of bf.
    cnt = 0
    collect = []
    for entry in bf:
        cnt += 1
        print(str(cnt) + '\t' + str(entry))
        collect.append(entry)
        if cnt == 7:
            # for the seventh entry, also print whatever iterating it yields
            for b in entry:
                print('\t', b)
        if cnt == 10:
            break
    # Write the collected entries out and load them again with the 'opt' format.
    # NOTE(review): hard-coded, user-specific path.
    writeBedFile(collect, '/Users/mikael/Desktop/test.bed')
    bf2 = BedFile('/Users/mikael/Desktop/test.bed', 'opt')
    # Query interval, also stored in a scratch tree with a placeholder value.
    q = ival.Interval(3805000, 3806000)
    t2 = ival.IntervalTree()
    t2.put(q, "blah")
    for entry in bf2:
        if entry.isBlockOverlap(q):
            print('Found:', entry)
            # add the entry's blocks to the scratch tree, then print the tree
            tree = entry.getBlocks()
            t2.putAll(tree)
            for t in t2:
                print(t)

    # Membership test of a hand-built entry against bf.
    entry1 = BedEntry('chrX', 3858266, 3858530)
    print(entry1 in bf)
    # Print every item returned by bf.getOverlap for a second entry.
    entry2 = BedEntry('chrX', 10047550, 10067694)
    for x in bf.getOverlap(entry2):
        print(x)