Пример #1
0
    def __init__(self, entries, format = 'Limited', bigBed=False):
        """
        Create a BedFile instance.

        The result is stored in ``self.chroms``: a dict keyed by chromosome
        name whose values are ``ival.IntervalTree`` instances (when built
        from an iterable) or whatever ``readBedFile`` / ``bigbed.readBigBed``
        return (when built from a file).

        :param entries: an iterable of entries or a filename
        :param format: the format of the BED file; a value starting with
            'bedpe' (case-insensitive) makes an iterable be treated as
            paired-end entries carrying chrom1/chromStart1/chromEnd1 and
            chrom2/chromStart2/chromEnd2
        :param bigBed: if true, entries is passed straight to
            bigbed.readBigBed
        """
        self.format = format
        if bigBed:
            self.chroms = bigbed.readBigBed(entries)
        elif isinstance(entries, str): # filename
            try:
                self.chroms = readBedFile(entries, format)
            except UnicodeDecodeError:
                # not decodable as text: retry the same path as a bigBed file
                self.chroms = bigbed.readBigBed(entries)
        else:
            self.chroms = dict()
            if format.lower().startswith('bedpe'):
                for entry in entries:
                    # Index the entry under BOTH of its anchors. (Previously the
                    # second anchor was only inserted when its chromosome had
                    # not been seen before.)
                    anchors = ((entry.chrom1, entry.chromStart1, entry.chromEnd1),
                               (entry.chrom2, entry.chromStart2, entry.chromEnd2))
                    for chrom, chromStart, chromEnd in anchors:
                        tree = self.chroms.get(chrom)
                        if not tree:
                            tree = ival.IntervalTree()
                            self.chroms[chrom] = tree
                        tree.put(ival.Interval(chromStart, chromEnd), entry)
            else:
                for entry in entries:
                    # check if the chromosome has been seen before
                    tree = self.chroms.get(entry.chrom)
                    if not tree:
                        tree = ival.IntervalTree()
                        self.chroms[entry.chrom] = tree
                    # put the entry in the interval tree for the appropriate chromosome
                    tree.put(ival.Interval(entry.chromStart, entry.chromEnd), entry)
Пример #2
0
 def poolBED(self, bedfile):
     """
     Add every entry of bedfile to this instance's per-chromosome trees.

     :param bedfile: an iterable of entries exposing chrom, chromStart
         and chromEnd
     """
     for record in bedfile:
         chrom_tree = self.chroms.get(record.chrom)
         if not chrom_tree:
             # first entry seen for this chromosome: start a new tree
             chrom_tree = self.chroms[record.chrom] = ival.IntervalTree()
         span = ival.Interval(record.chromStart, record.chromEnd)
         chrom_tree.put(span, record)
Пример #3
0
def readBigBed(filename):
    """
    Read a bigBed file whose embedded SQL names it as narrowPeak or broadPeak.

    :param filename: path handed to pyBigWig.open
    :return: dict mapping chromosome name to an ival.IntervalTree holding
        one bed.BedEntry per record; empty when the file is not a bigBed
        or its SQL mentions neither 'narrowpeak' nor 'broadpeak'
    :raises RuntimeError: re-raised with a 'Error in BIGBED file' prefix
        when a RuntimeError occurs while processing a record
    """
    bb = pyBigWig.open(filename)
    chroms = dict()

    try:
        if bb.isBigBed():
            # decode once and reuse for both keyword checks
            sql = bb.SQL().decode('utf8').lower()
            if 'narrowpeak' in sql or 'broadpeak' in sql:
                for chrom in bb.chroms():
                    # bb.chroms(chrom) is used as the end coordinate of the query
                    entries = bb.entries(chrom, 0, bb.chroms(chrom))
                    for entry in entries:
                        try:
                            # entry is indexed as (start, end, rest-of-line string)
                            words = entry[2].strip().split()
                            chromStart = int(entry[0])
                            chromEnd = int(entry[1])
                            bed_entry = bed.BedEntry(chrom, chromStart, chromEnd)

                            if len(words) >= 7:  # narrowpeaks
                                bed_entry.addOption(name=words[0],
                                                    score=int(words[1]),
                                                    strand=words[2],
                                                    signalValue=float(words[3]),
                                                    pValue=float(words[4]),
                                                    qValue=float(words[5]),
                                                    peak=int(words[6]))
                            else:  # broadpeaks
                                bed_entry.addOption(name=words[0],
                                                    score=int(words[1]),
                                                    strand=words[2],
                                                    signalValue=float(words[3]),
                                                    pValue=float(words[4]),
                                                    qValue=float(words[5]))

                            # check if the chromosome has been seen before
                            tree = chroms.get(chrom)
                            if not tree:
                                tree = ival.IntervalTree()
                                chroms[chrom] = tree
                            # put the entry in the interval tree for the appropriate chromosome
                            iv = ival.Interval(bed_entry.chromStart,
                                               bed_entry.chromEnd)
                            tree.put(iv, bed_entry)
                        except RuntimeError as e:
                            # RuntimeError has no 'strerror' attribute; format
                            # the exception itself so the message is usable.
                            raise RuntimeError('Error in BIGBED file (%s)' % (e,))
            else:
                print("BigBed file not ENCODE narrowPeak or broadPeak")
    finally:
        # release the handle even when parsing fails
        bb.close()
    return chroms
Пример #4
0
    def __contains__(self, item):
        """
        Test whether the interval(s) of item are present in this instance.

        For a BedEntry the result is a bool: whether the entry's interval is
        in the tree stored for its chromosome (False if there is no tree).
        For a BedPE the result is a two-element list with one bool per
        anchor, in anchor order. Any other type gives False.

        NOTE(review): when this is invoked through the ``in`` operator,
        Python converts the result to bool, so the two-element list returned
        for a BedPE item is always truthy.
        """
        if isinstance(item, BedEntry):
            chrom_tree = self.chroms.get(item.chrom)
            if chrom_tree is not None:
                return ival.Interval(item.chromStart, item.chromEnd) in chrom_tree
            return False

        if not isinstance(item, BedPE):
            return False

        first_tree = self.chroms.get(item.chrom1)
        second_tree = self.chroms.get(item.chrom2)
        first_found = (first_tree is not None
                       and ival.Interval(item.chromStart1, item.chromEnd1) in first_tree)
        second_found = (second_tree is not None
                        and ival.Interval(item.chromStart2, item.chromEnd2) in second_tree)
        return [first_found, second_found]
Пример #5
0
 def getClosest(self, item):
     """
     Look up the stored entries closest to item.

     :param item: a BedEntry or a BedPE
     :return: for a BedEntry, the ``values`` of the node returned by
         ``tree.closest`` for the entry's interval, or None when the
         chromosome has no tree or no node is returned; for a BedPE, a
         two-element list holding that same result for each anchor
         (anchor 1 first); None for any other type
     """
     if isinstance(item, BedEntry):
         tree = self.chroms.get(item.chrom)
         if tree is None:
             return None
         node = tree.closest(ival.Interval(item.chromStart, item.chromEnd))
         return node.values if node is not None else None
     elif isinstance(item, BedPE):
         # Each anchor is looked up with its OWN coordinates. (The second
         # anchor was previously queried with chromStart1/chromEnd1.)
         anchors = ((self.chroms.get(item.chrom1), item.chromStart1, item.chromEnd1),
                    (self.chroms.get(item.chrom2), item.chromStart2, item.chromEnd2))
         container = []
         for tree, chromStart, chromEnd in anchors:
             node = None
             if tree is not None:
                 node = tree.closest(ival.Interval(chromStart, chromEnd))
             container.append(node.values if node is not None else None)
         return container
     else:
         return None
Пример #6
0
 def getOverlap(self, item):
     """
     Collect the stored entries whose intervals intersect item.

     :param item: a BedEntry or a BedPE
     :return: for a BedEntry, a flat list of the ``values`` of every node
         returned by ``tree.isectall``, or None when the chromosome has no
         tree; for a BedPE, a two-element list with that result per anchor
         (anchor 1 first); None for any other type
     """
     def gather(tree, begin, end):
         # flatten the values of all nodes intersecting [begin, end]
         nodes = tree.isectall(ival.Interval(begin, end))
         return [value for node in nodes for value in node.values]

     if isinstance(item, BedEntry):
         chrom_tree = self.chroms.get(item.chrom)
         if chrom_tree is None:
             return None
         return gather(chrom_tree, item.chromStart, item.chromEnd)

     if isinstance(item, BedPE):
         first_tree = self.chroms.get(item.chrom1)
         second_tree = self.chroms.get(item.chrom2)
         return [
             None if first_tree is None
             else gather(first_tree, item.chromStart1, item.chromEnd1),
             None if second_tree is None
             else gather(second_tree, item.chromStart2, item.chromEnd2),
         ]

     return None
Пример #7
0
def readBedFile(filename, format = 'Limited'):
    """ Read a BED file.
        format: specifies the format of the file,
        "Limited", e.g.
            chr22 1000 5000
            chr22 2000 6000
        "Optional", e.g.
            track name=pairedReads description="Clone Paired Reads" useScore=1
            chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
            chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
            ...
            (also handles the Limited + score, and BED6 format)
        "Peaks", e.g.
            chr1    569780    569930    .    0    .    19    6.07811    -1    -1
            chr1    713300    713450    .    0    .    54    49.1167    -1    -1
        "Strand", e.g.
            chr4    185772359    185772424    -
            chr18    20513381    20513401    +
        also supports a 5th label field
            chr5    20611949        20611949        +       ENSG00000251629_20611949
            chr3    42187863        42187863        -       ENSG00000234562_42187863
        "Summit", e.g.
            # d = 130
            chr      start    end   length summit  tags -10*log10(pvalue)    fold_enrichment    FDR(%)
            chr1     8250     8671    422    286    46    145.84    11.68    0.51
            chr1    36382    36984    603    405    46    315.23    27.05    0.24
        "CCAT", e.g.
            chr8    94747805    94747070    94749250    525     3    21.519196    0.002000
            chr17   55277895    55277070    55279280    560    18    21.283333    0.002000
        "Cropped", e.g.
            chr1    851602    10
            chr1    921184    18
            chr1    931838    9
        "BedPE", e.g.
            chrom1  chromStart1 chromEnd1   chrom2  chromStart2 chromEnd2 + any number of additional fields
            chr1    85617       86100       chr1    120030      125039
            chr2    73891       74871       chr5    12709       12990

        The format name is matched case-insensitively by prefix. Besides
        the formats above, the prefixes 'bed6', 'bed12', 'rp', 'idr',
        '2idr', 'tss' and 'mspc' are recognised.

        Two layouts are detected from the file content regardless of
        format: once a line starting with '=' has been seen the file is
        parsed as a table whose data rows follow a line starting with '-';
        once a line starting with 'Position' has been seen each row is
        parsed as 'chrom:centre' plus further columns, giving an interval
        of centre +/- 50.

        :param filename: path of the text file to read
        :param format: format name (a string), see above
        :return: dict mapping chromosome name to an ival.IntervalTree; a
            BedPE entry is stored once under each of its two anchors
        :raises RuntimeError: when a RuntimeError occurs on a row after the
            single tolerated header row has been used up. ValueError and
            IndexError from malformed rows are not caught, apart from
            non-integer start/end columns in the standard formats, where
            the row is printed and skipped.
    """
    fmt = format.lower()
    chroms = dict()

    def store(chrom, lo, hi, entry):
        # File entry under [lo, hi] in the tree of chrom, creating the tree on first use.
        tree = chroms.get(chrom)
        if not tree:
            tree = ival.IntervalTree()
            chroms[chrom] = tree
        tree.put(ival.Interval(lo, hi), entry)

    row = 0
    acceptHeaderRows = 1
    sissrs = False
    gem = False
    inTable = False  # set once the '-' line preceding the '=' layout's data rows was seen
    # 'with' guarantees the file is closed even when a row fails to parse
    with open(filename) as f:
        for line in f:
            row += 1
            words = line.strip().split()
            if len(words) == 0:
                continue # ignore empty lines
            if words[0].startswith('='):
                sissrs = True
                continue
            if words[0].startswith('Position'):
                gem = True
                continue
            if sissrs:
                if words[0].startswith('-'):
                    inTable = True
                    continue
                elif inTable:
                    chrom = words[0]
                    chromStart = int(words[1])
                    chromEnd = int(words[2])
                    entry = BedEntry(chrom, chromStart, chromEnd)
                    # the 4th column is used both as signalValue and as score
                    entry.addOption(signalValue=int(words[3]), name = " ", score = int(words[3]), strand = '.',
                                    pValue = float(-1), qValue = float(-1), peak = int(-1))
                    store(chrom, entry.chromStart, entry.chromEnd, entry)
                else:
                    continue
            elif gem:
                chrom, centre = words[0].split(':')
                centre = int(centre)
                chromStart = centre - 50
                chromEnd = centre + 50
                entry = BedEntry(chrom, chromStart, chromEnd)
                entry.addOption(signalValue=float(words[1]), name=" ", score=float(words[7]), strand=words[13], peak=centre,
                                pValue=float(words[6]), qValue=float(words[5]))
                store(chrom, entry.chromStart, entry.chromEnd, entry)
            else:
                if words[0].startswith(('#', 'browser', 'track')):
                    continue # comment / browser line / track line
                if words[1].startswith('start'):
                    continue # column-header row
                try:
                    if fmt.startswith('bedpe'):
                        chrom1 = words[0]
                        chromStart1 = int(words[1])
                        chromEnd1 = int(words[2])
                        chrom2 = words[3]
                        chromStart2 = int(words[4])
                        chromEnd2 = int(words[5])

                        entry = BedPE(chrom1, chromStart1, chromEnd1, chrom2, chromStart2, chromEnd2)
                        if len(words) == 8:
                            entry.addOption(PETs=int(words[6]), pValue=float(words[7]))
                        if len(words) == 13:
                            entry.addOption(name1=words[6], name2=words[7], depth1=int(words[8]), depth2=int(words[9]),
                                            PETs=int(words[10]), pValue=float(words[11]), fdr=float(words[12]))
                        # the same entry object is indexed under both anchors
                        store(chrom1, entry.chromStart1, entry.chromEnd1, entry)
                        store(chrom2, entry.chromStart2, entry.chromEnd2, entry)
                    else:
                        chrom = words[0]
                        if fmt.startswith('ccat'):
                            chromStart = int(words[2])
                            chromEnd = int(words[3])
                        else: # all other standard BED formats
                            try:
                                chromStart = int(words[1])
                                chromEnd = int(words[2])
                            except ValueError:
                                print(words)
                                continue
                        entry = BedEntry(chrom, chromStart, chromEnd)
                        if fmt.startswith('opt'):
                            if len(words) >= 9:
                                entry.addOption(name = words[3], score = float(words[4]), strand = words[5],
                                                thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8])
                            elif len(words) >= 6:
                                entry.addOption(name = words[3], score = float(words[4]), strand = words[5])
                            elif len(words) >= 5:
                                entry.addOption(name = words[3], score = float(words[4]))
                            elif len(words) >= 4:
                                entry.addOption(name = words[3])
                            else:
                                # NOTE(review): only reached with fewer than 4 columns, where
                                # words[3] raises IndexError; kept as in the original pending
                                # a decision on the intended "Limited + score" handling.
                                entry.addOption(name = '.', score = int(words[3]), strand = '.')
                        elif fmt.startswith('bed6'):
                            entry.addOption(name=words[3], score=float(words[4]), strand=words[5])
                        elif fmt.startswith('strand'):
                            if len(words) >= 4: # properly formatted
                                entry.addOption(strand = words[3])
                            if len(words) >= 5:
                                entry.addOption(name = words[4])
                        elif fmt.startswith('peak'):
                            if len(words) >= 10: # narrowpeaks
                                entry.addOption(name = words[3], score = int(words[4]), strand = words[5],
                                                signalValue = float(words[6]), pValue = float(words[7]),
                                                qValue = float(words[8]), peak = int(words[9]))
                            else: # broadpeaks
                                entry.addOption(name = words[3], score = int(words[4]), strand = words[5],
                                                signalValue = float(words[6]), pValue = float(words[7]),
                                                qValue = float(words[8]))
                        elif fmt.startswith('rp'):
                            entry.addOption(name=words[3], score=int(words[4]), strand=words[5],
                                            signalValue=float(words[6]), pValue=float(words[7]),
                                            qValue=float(words[8]), rank=[float(r) for r in words[9].split(",")])
                        elif fmt.startswith('idr'):
                            entry.addOption(name=words[3], score=int(words[4]), strand=words[5],
                                            signalValue=float(words[6]), pValue=float(words[7]),
                                            qValue=float(words[8]))
                        elif fmt.startswith('2idr'):
                            #For IDR input with actual IDR values
                            entry.addOption(name=words[3], pValue=float(words[9]),
                                            qValue=float(words[10]))
                        elif fmt.startswith('summit'):
                            if len(words) >= 9:
                                entry.addOption(summit = int(words[4]), tags = int(words[5]), pValue = float(words[6]),
                                                fold = float(words[7]), fdr = float(words[8]))
                            else:
                                entry.addOption(summit = int(words[4]), tags = int(words[5]),
                                                pValue = float(words[6]), fold = float(words[7]))
                        elif fmt.startswith('ccat'):
                            entry.addOption(summit = int(words[1]) - entry.chromStart, tags = int(words[4]), bg = int(words[5]),
                                            zscore = float(words[6]), fdr = float(words[7]), name = '.',
                                            score = int(words[4]), strand = '.')
                        elif fmt.startswith('crop'):
                            entry.addOption(score = int(words[2]), name = '.', strand = '.')
                            entry.chromEnd = entry.chromStart + 1
                        elif fmt.startswith('bed12'):
                            entry.addOption(name=words[3], score=float(words[4]), strand=words[5], thickStart=int(words[6]),
                                            thickEnd=int(words[7]), itemRgb=words[8], blockCount=int(words[9]),
                                            blockSizes=words[10], blockStarts=words[11])
                        elif fmt.startswith('tss'):
                            # compared in lower case: the previous 'TSS' literal could
                            # never match the lower-cased format name
                            entry.addOption(name=str(words[3]), gene=str(words[4]), strand=words[5])
                        elif fmt.startswith('mspc'):
                            entry.addOption(name=str(words[3]), signalValue=float(words[4]))

                        # use the entry's own coordinates: 'crop' rewrites chromEnd above
                        store(chrom, entry.chromStart, entry.chromEnd, entry)
                except RuntimeError as e:
                    if not acceptHeaderRows:
                        # RuntimeError has no 'strerror'; format the exception itself
                        raise RuntimeError('Error in BED file at row %d (%s)' % (row, e))
                    else:
                        acceptHeaderRows -= 1 # count down the number of header rows that can occur
    return chroms
Пример #8
0
 def getInterval(self):
     """Return an ival.Interval built from this entry's chromStart and chromEnd."""
     begin, end = self.chromStart, self.chromEnd
     return ival.Interval(begin, end)
Пример #9
0
 def getInterval(self):
     """
     Return both anchors of this entry as a two-element list of
     ival.Interval objects: first (chromStart1, chromEnd1), then
     (chromStart2, chromEnd2).
     """
     first_anchor = ival.Interval(self.chromStart1, self.chromEnd1)
     second_anchor = ival.Interval(self.chromStart2, self.chromEnd2)
     return [first_anchor, second_anchor]