def testBothExonsScheme(self):
    """
    With the BOTH_EXONS scheme, consecutive output lines are treated as a
    (first exon, second exon) pair. For each pair, check that both halves
    carry the same read name (ignoring the two-character suffix), that
    their combined length matches the expected length for that read, and
    that the first exon ends / the second exon starts at the expected
    chromosome coordinates.
    """
    # push the fixture reads through processBED
    source = StringIO.StringIO("\n".join(self.readLines))
    sink = StringIO.StringIO()
    processBED(source, sink, BOTH_EXONS)

    # drop blank lines from whatever was written
    outlines = [ln for ln in sink.getvalue().split("\n")
                if ln.strip() != ""]

    for idx in range(0, len(outlines), 2):
        first, second = outlines[idx], outlines[idx + 1]
        e1 = parseBEDString(first)
        e2 = parseBEDString(second)

        # both halves must come from the same read
        self.assertTrue(e1.name[:-2] == e2.name[:-2])
        key = e1.name[:-2]

        answer = len(e1) + len(e2)
        self.assertTrue(self.lengths[key] == answer)
        self.assertTrue(e1.end == self.firstChromEnds[key])
        self.assertTrue(e2.start == self.secondChromStarts[key])
def testBothExonsScheme(self):
    """
    With the BOTH_EXONS scheme, consecutive output lines are treated as a
    (first exon, second exon) pair. For each pair, check that both halves
    carry the same read name (ignoring the two-character suffix), that
    their combined length matches the expected length for that read, and
    that the first exon ends / the second exon starts at the expected
    chromosome coordinates.
    """
    # push the fixture reads through processBED
    source = StringIO.StringIO("\n".join(self.readLines))
    sink = StringIO.StringIO()
    processBED(source, sink, BOTH_EXONS)

    # drop blank lines from whatever was written
    outlines = [ln for ln in sink.getvalue().split("\n")
                if ln.strip() != ""]

    for idx in range(0, len(outlines), 2):
        first, second = outlines[idx], outlines[idx + 1]
        e1 = parseBEDString(first)
        e2 = parseBEDString(second)

        # both halves must come from the same read
        self.assertTrue(e1.name[:-2] == e2.name[:-2])
        key = e1.name[:-2]

        answer = len(e1) + len(e2)
        self.assertTrue(self.lengths[key] == answer)
        self.assertTrue(e1.end == self.firstChromEnds[key])
        self.assertTrue(e2.start == self.secondChromStarts[key])
def testSecondExonScheme(self):
    """
    With the SECOND_EXON scheme, every output element must have the
    length left over once the first-exon portion of the read is removed,
    and must start at the expected second-exon chromosome coordinate.
    """
    # push the fixture reads through processBED
    source = DummyInputStream(self.readLines)
    sink = DummyOutputStream()
    processBED(source, sink, SECOND_EXON)

    # stripped, non-blank lines that were written
    outlines = [ln.strip() for ln in sink.itemsWritten()
                if ln.strip() != ""]

    for out in outlines:
        e2 = parseBEDString(out)
        gotAnswer = len(e2)
        key = e2.name[:-2]  # read name without its two-character suffix

        r_len = self.readEnds[key] - self.readStarts[key]
        # where the read begins in chromosome coordinates
        glob_s = self.firstChromStarts[key] + self.readStarts[key]
        expectedAns = r_len - (self.firstChromEnds[key] - glob_s) - 1

        self.assertTrue(gotAnswer == expectedAns)
        self.assertTrue(e2.start == self.secondChromStarts[key])
def testFirstExonScheme(self):
    """
    With the FIRST_EXON scheme, every output element must span from the
    read's chromosome-coordinate start to the end of the first exon, and
    must end at the expected first-exon chromosome coordinate.
    """
    # push the fixture reads through processBED
    source = DummyInputStream(self.readLines)
    sink = DummyOutputStream()
    processBED(source, sink, FIRST_EXON)

    # stripped, non-blank lines that were written
    outlines = [ln.strip() for ln in sink.itemsWritten()
                if ln.strip() != ""]

    for out in outlines:
        e1 = parseBEDString(out)
        gotAnswer = len(e1)
        key = e1.name[:-2]  # read name without its two-character suffix

        # where the read begins in chromosome coordinates
        read_start_global = self.firstChromStarts[key] + self.readStarts[key]
        expectedAns = self.firstChromEnds[key] - read_start_global + 1

        self.assertTrue(gotAnswer == expectedAns)
        self.assertTrue(e1.end == self.firstChromEnds[key])
def testSecondExonScheme(self):
    """
    With the SECOND_EXON scheme, every output element must have the
    length left over once the first-exon portion of the read is removed,
    and must start at the expected second-exon chromosome coordinate.
    """
    # push the fixture reads through processBED
    source = DummyInputStream(self.readLines)
    sink = DummyOutputStream()
    processBED(source, sink, SECOND_EXON)

    # stripped, non-blank lines that were written
    outlines = [ln.strip() for ln in sink.itemsWritten()
                if ln.strip() != ""]

    for out in outlines:
        e2 = parseBEDString(out)
        gotAnswer = len(e2)
        key = e2.name[:-2]  # read name without its two-character suffix

        r_len = self.readEnds[key] - self.readStarts[key]
        # where the read begins in chromosome coordinates
        glob_s = self.firstChromStarts[key] + self.readStarts[key]
        expectedAns = r_len - (self.firstChromEnds[key] - glob_s) - 1

        self.assertTrue(gotAnswer == expectedAns)
        self.assertTrue(e2.start == self.secondChromStarts[key])
def testFirstExonScheme(self):
    """
    With the FIRST_EXON scheme, every output element must span from the
    read's chromosome-coordinate start to the end of the first exon, and
    must end at the expected first-exon chromosome coordinate.
    """
    # push the fixture reads through processBED
    source = DummyInputStream(self.readLines)
    sink = DummyOutputStream()
    processBED(source, sink, FIRST_EXON)

    # stripped, non-blank lines that were written
    outlines = [ln.strip() for ln in sink.itemsWritten()
                if ln.strip() != ""]

    for out in outlines:
        e1 = parseBEDString(out)
        gotAnswer = len(e1)
        key = e1.name[:-2]  # read name without its two-character suffix

        # where the read begins in chromosome coordinates
        read_start_global = self.firstChromStarts[key] + self.readStarts[key]
        expectedAns = self.firstChromEnds[key] - read_start_global + 1

        self.assertTrue(gotAnswer == expectedAns)
        self.assertTrue(e1.end == self.firstChromEnds[key])
def processBED(infh, outhandle, scheme, verbose=False):
    """
    Convert reads whose chrom field encodes two genomic regions into BED
    lines in genomic coordinates, writing the result to outhandle.

    The chrom field of each read is split on underscores into five parts,
    counted from the right so that chromosome names which themselves
    contain underscores stay intact. The parts are read as: chromosome
    name, start of the first region, end of the first region, start of the
    second region. The fifth part is not used here.

    :param infh: anything accepted by BEDIterator (the reads to convert).
    :param outhandle: object with a write() method; receives one line per
                      output element.
    :param scheme: which element(s) to output for each read; one of
                   FIRST_EXON, SECOND_EXON, BIGGEST_EXON, FIVE_PRIME_END
                   or BOTH_EXONS.
    :param verbose: passed through to BEDIterator.
    :raise ValueError: if no output can be selected for a read under the
                       given scheme (unknown scheme, or FIVE_PRIME_END with
                       a strand that is neither "+" nor "-"), or if an
                       output line parses to an empty chrom.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices; splitting from
        # the right keeps underscores inside the chromosome name itself
        parts = read.chrom.rsplit("_", 4)
        chrom = parts[0]
        chrom1SeqStart = int(parts[1])
        chrom1SeqEnd = int(parts[2])
        chrom2SeqStart = int(parts[3])

        # only build the intervals the scheme could possibly need
        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom,
                                        chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name,
                                        read.score, read.strand)
        if scheme != FIRST_EXON:
            end = chrom2SeqStart + \
                (read.end - (chrom1SeqEnd - chrom1SeqStart)) - 1
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end,
                                         read.name, read.score,
                                         read.strand)

        # we add %1 or %2 to the end of the read names so they can
        # be distinguished later
        if firstExon is not None:
            firstExon.name = firstExon.name + "%1"
        if secondExon is not None:
            secondExon.name = secondExon.name + "%2"

        # arbitrarily decide the first exon contains the largest portion
        # of the read if both are the same
        if (scheme == FIRST_EXON) or \
           (scheme == BIGGEST_EXON and
                len(firstExon) >= len(secondExon)) or \
           (scheme == FIVE_PRIME_END and read.strand == "+"):
            out = str(firstExon)
        elif (scheme == SECOND_EXON) or \
             (scheme == BIGGEST_EXON and
                len(secondExon) > len(firstExon)) or \
             (scheme == FIVE_PRIME_END and read.strand == "-"):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # without this, 'out' would either be unbound or silently
            # carry over the previous read's output
            raise ValueError("no output exon selected for scheme " +
                             str(scheme) + " and read " + str(read))

        # sanity check -- make sure we create a valid output string
        for l in out.split("\n"):
            e = parseBEDString(l)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
def processBED(infh, outhandle, scheme, verbose=False):
    """
    Convert reads whose chrom field encodes two genomic regions into BED
    lines in genomic coordinates, writing the result to outhandle.

    The chrom field of each read is split on underscores into five parts,
    counted from the right so that chromosome names which themselves
    contain underscores stay intact. The parts are read as: chromosome
    name, start of the first region, end of the first region, start of the
    second region. The fifth part is not used here.

    :param infh: anything accepted by BEDIterator (the reads to convert).
    :param outhandle: object with a write() method; receives one line per
                      output element.
    :param scheme: which element(s) to output for each read; one of
                   FIRST_EXON, SECOND_EXON, BIGGEST_EXON, FIVE_PRIME_END
                   or BOTH_EXONS.
    :param verbose: passed through to BEDIterator.
    :raise ValueError: if no output can be selected for a read under the
                       given scheme (unknown scheme, or FIVE_PRIME_END with
                       a strand that is neither "+" nor "-"), or if an
                       output line parses to an empty chrom.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices; splitting from
        # the right keeps underscores inside the chromosome name itself
        parts = read.chrom.rsplit("_", 4)
        chrom = parts[0]
        chrom1SeqStart = int(parts[1])
        chrom1SeqEnd = int(parts[2])
        chrom2SeqStart = int(parts[3])

        # only build the intervals the scheme could possibly need
        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom,
                                        chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name,
                                        read.score, read.strand)
        if scheme != FIRST_EXON:
            end = chrom2SeqStart + \
                (read.end - (chrom1SeqEnd - chrom1SeqStart)) - 1
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end,
                                         read.name, read.score,
                                         read.strand)

        # we add %1 or %2 to the end of the read names so they can
        # be distinguished later
        if firstExon is not None:
            firstExon.name = firstExon.name + "%1"
        if secondExon is not None:
            secondExon.name = secondExon.name + "%2"

        # arbitrarily decide the first exon contains the largest portion
        # of the read if both are the same
        if (scheme == FIRST_EXON) or \
           (scheme == BIGGEST_EXON and
                len(firstExon) >= len(secondExon)) or \
           (scheme == FIVE_PRIME_END and read.strand == "+"):
            out = str(firstExon)
        elif (scheme == SECOND_EXON) or \
             (scheme == BIGGEST_EXON and
                len(secondExon) > len(firstExon)) or \
             (scheme == FIVE_PRIME_END and read.strand == "-"):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # without this, 'out' would either be unbound or silently
            # carry over the previous read's output
            raise ValueError("no output exon selected for scheme " +
                             str(scheme) + " and read " + str(read))

        # sanity check -- make sure we create a valid output string
        for l in out.split("\n"):
            e = parseBEDString(l)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
def testInclusion(self):
    """
    If a read appears in the input, it should appear in the output and
    vice-versa. For the BOTH_EXONS scheme, each read should appear
    exactly twice in the output.
    """
    for scheme in self.schemes:
        infh = StringIO.StringIO("\n".join(self.readLines))
        outfh = StringIO.StringIO()
        processBED(infh, outfh, scheme)

        # see what we get..
        outlines = [l for l in outfh.getvalue().split("\n")
                    if l.strip() != ""]
        # strip the two-character suffix that processBED appends to names
        outnames = [parseBEDString(line).name[:-2] for line in outlines]
        self.assertTrue(set(outnames) == set(self.names.keys()))

        if scheme == BOTH_EXONS:
            # this comparison used to be a bare expression whose result
            # was discarded; assert it, and avoid division so an odd
            # number of output lines cannot round its way to a pass
            self.assertEqual(len(outnames), 2 * len(self.names))
def randomBEDElement(name=None, chrom=None, start=None, end=None,
                     delim="\t", maxIndex=1000000):
    """
    Build a six-field BED line, filling any field that was not supplied
    with a random value, and return the result of parsing it with
    parseBEDString.

    :param name: element name; random 10-character name if None.
    :param chrom: chromosome name; random 10-character name if None.
    :param start: start index; random value below maxIndex - 1 if None.
    :param end: end index; random value in [start, maxIndex) if None.
    :param delim: delimiter placed between the fields of the line.
    :param maxIndex: upper bound used when drawing start and end.
    """
    MAX_SCORE = 30

    # NOTE: the defaults are resolved in this fixed order so the sequence
    # of random draws stays the same for a given seed
    name = randomName(10) if name is None else name
    chrom = randomName(10) if chrom is None else chrom
    if start is None:
        start = int(random.random() * (maxIndex - 1))
    if end is None:
        end = int(random.random() * (maxIndex - start) + start)

    score = int(random.random() * MAX_SCORE)
    strand = "+" if random.random() <= 0.5 else "-"

    fields = [chrom, str(start), str(end), name, str(score), strand]
    return parseBEDString(delim.join(fields))
def testInclusion(self):
    """
    If a read appears in the input, it should appear in the output and
    vice-versa. For the BOTH_EXONS scheme, each read should appear
    exactly twice in the output.
    """
    for scheme in self.schemes:
        infh = StringIO.StringIO("\n".join(self.readLines))
        outfh = StringIO.StringIO()
        processBED(infh, outfh, scheme)

        # see what we get..
        outlines = [l for l in outfh.getvalue().split("\n")
                    if l.strip() != ""]
        # strip the two-character suffix that processBED appends to names
        outnames = [parseBEDString(line).name[:-2] for line in outlines]
        self.assertTrue(set(outnames) == set(self.names.keys()))

        if scheme == BOTH_EXONS:
            # this comparison used to be a bare expression whose result
            # was discarded; assert it, and avoid division so an odd
            # number of output lines cannot round its way to a pass
            self.assertEqual(len(outnames), 2 * len(self.names))
def randomBEDElement(name=None, chrom=None, start=None, end=None,
                     delim="\t", maxIndex=1000000):
    """
    Build a six-field BED line, filling any field that was not supplied
    with a random value, and return the result of parsing it with
    parseBEDString.

    :param name: element name; random 10-character name if None.
    :param chrom: chromosome name; random 10-character name if None.
    :param start: start index; random value below maxIndex - 1 if None.
    :param end: end index; random value in [start, maxIndex) if None.
    :param delim: delimiter placed between the fields of the line.
    :param maxIndex: upper bound used when drawing start and end.
    """
    MAX_SCORE = 30

    # NOTE: the defaults are resolved in this fixed order so the sequence
    # of random draws stays the same for a given seed
    name = randomName(10) if name is None else name
    chrom = randomName(10) if chrom is None else chrom
    if start is None:
        start = int(random.random() * (maxIndex - 1))
    if end is None:
        end = int(random.random() * (maxIndex - start) + start)

    score = int(random.random() * MAX_SCORE)
    strand = "+" if random.random() <= 0.5 else "-"

    fields = [chrom, str(start), str(end), name, str(score), strand]
    return parseBEDString(delim.join(fields))
def testPairedIterator(self):
    """
    Run pairedBEDIterator over three small BED streams under two
    configurations and compare, stream by stream, what it yields against
    hand-written expectations.
    """
    debug = False

    def bed(chrom, start, end, score, strand):
        # one tab-delimited BED line; every element in this test is named X
        return "\t".join([chrom, start, end, "X", score, strand])

    def stream_text(lines):
        # each input line is newline-terminated
        return "".join(ln + "\n" for ln in lines)

    in1 = stream_text([bed("chr1", "10", "15", "1", "+"),
                       bed("chr1", "20", "25", "2", "-"),
                       bed("chr1", "40", "47", "3", "+"),
                       bed("chr2", "10", "15", "4", "-")])
    in2 = stream_text([bed("chr1", "10", "15", "1", "+"),
                       bed("chr1", "40", "47", "2", "+"),
                       bed("chr2", "10", "15", "3", "-")])
    in3 = stream_text([bed("chr1", "20", "25", "1", "+"),
                       bed("chr1", "40", "47", "2", "+"),
                       bed("chr2", "10", "15", "3", "-"),
                       bed("chr3", "20", "25", "4", "+")])

    def run_and_check(e1, e2, e3, **iterArgs):
        # fresh streams for every run
        instms = [DummyInputStream(x) for x in [in1, in2, in3]]
        allOut = list(pairedBEDIterator(instms, **iterArgs))

        # regroup the yielded triples into one list per input stream
        got1, got2, got3 = [], [], []
        for x1, x2, x3 in allOut:
            got1.append(x1)
            got2.append(x2)
            got3.append(x3)

        # parse all expectations up front, then compare stream by stream
        expected = [[parseBEDString(x, scoreType=float) for x in raw]
                    for raw in (e1, e2, e3)]
        for g, e in zip((got1, got2, got3), expected):
            if debug:
                sys.stderr.write("expect\n" +
                                 "\n".join([str(x) for x in e]) + "\n")
                sys.stderr.write("got\n" +
                                 "\n".join([str(x) for x in g]) + "\n")
            assert (g == e)

    # first, ignore strand, name and score and don't mirror missing elements
    run_and_check(
        [bed("chr1", "40", "47", "3", "+"),
         bed("chr2", "10", "15", "4", "-")],
        [bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-")],
        [bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-")],
        mirror=False, mirrorScore=None, ignoreStrand=True,
        ignoreScore=True, ignoreName=True)

    # now, same sort order but include strand, and mirror missing elements
    # using a score of 0
    run_and_check(
        [bed("chr1", "10", "15", "1", "+"),
         bed("chr1", "20", "25", "0", "+"),
         bed("chr1", "20", "25", "2", "-"),
         bed("chr1", "40", "47", "3", "+"),
         bed("chr2", "10", "15", "4", "-"),
         bed("chr3", "20", "25", "0", "+")],
        [bed("chr1", "10", "15", "1", "+"),
         bed("chr1", "20", "25", "0", "+"),
         bed("chr1", "20", "25", "0", "-"),
         bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-"),
         bed("chr3", "20", "25", "0", "+")],
        [bed("chr1", "10", "15", "0", "+"),
         bed("chr1", "20", "25", "1", "+"),
         bed("chr1", "20", "25", "0", "-"),
         bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-"),
         bed("chr3", "20", "25", "4", "+")],
        mirror=True, mirrorScore=0, ignoreStrand=False,
        ignoreScore=True, ignoreName=True)
def BEDIterator(filehandle, sortedby=None, verbose=False, scoreType=int,
                dropAfter=None):
    """
    Get an iterator for a BED file

    :param filehandle: this can be either a string, or a stream-like object.
                       In the former case, it is treated as a filename and
                       the file opened here is closed again when iteration
                       finishes, fails, or the iterator is closed. The
                       format of the file/stream must be BED.
    :param sortedby:   if None, order is not checked.
                       if == ITERATOR_SORTED_START, elements in file must
                       be sorted by chrom and start index (an exception
                       is raised if they are not)
                       if == ITERATOR_SORTED_END, element must be sorted
                       by chrom and end index.
                       if == ITERATOR_SORTED_CHROM, only the chrom order is
                       checked.
                       if == ITERATOR_SORTED_NAME, element names must not
                       decrease from one element to the next.
                       For the three chrom-based orders, chroms must not
                       decrease (string comparison) and a chrom must not
                       re-appear after a different one.
    :param verbose:    if True, output additional progress messages to
                       stderr
    :param scoreType:  The data type for scores (the fifth column) in the
                       BED file.
    :param dropAfter:  an int indicating that any fields after and including
                       this field should be ignored as they don't conform to
                       the BED format. By default, None, meaning we use all
                       fields. Index from zero.
    :return: iterator where subsequent calls to next() yield the next BED
             element in the stream as a GenomicInterval object.
    :raise BEDError: if a line cannot be parsed, or if the elements are not
                     in the order demanded by sortedby.
    """
    chromsSeen = set()
    prev = None

    # only close what we opened; caller-supplied streams stay open
    openedHere = False
    if isinstance(filehandle, str):
        filehandle = open(filehandle)
        openedHere = True

    # in-memory streams have no name; use the same placeholder in every
    # error message rather than failing with an AttributeError
    sourceName = str(getattr(filehandle, "name", "UNNAMED STREAM"))

    try:
        if verbose:
            try:
                pind = ProgressIndicator(
                    totalToDo=os.path.getsize(filehandle.name),
                    messagePrefix="completed",
                    messageSuffix="of processing " + filehandle.name)
            except (AttributeError, OSError):
                sys.stderr.write("BEDIterator -- warning: " +
                                 "unable to show progress for stream")
                verbose = False

        for line in filehandle:
            if verbose:
                pind.done = filehandle.tell()
                pind.showProgress()

            if line.strip() == "":
                continue
            try:
                e = parseBEDString(line, scoreType, dropAfter=dropAfter)
            except GenomicIntervalError as err:
                raise BEDError(str(err) + " on line " + line)

            # sorting by name?
            if ((sortedby == ITERATOR_SORTED_NAME and prev is not None) and
                    (prev.name > e.name)):
                raise BEDError("bed file " + sourceName +
                               " not sorted by element name" +
                               " found " + e.name + " after " + prev.name)

            # first item
            if prev is None:
                chromsSeen.add(e.chrom)

            # on same chrom as the prev item, make sure order is right
            if prev is not None and sortedby is not None and \
                    e.chrom == prev.chrom:
                if sortedby == ITERATOR_SORTED_START and \
                        prev.start > e.start:
                    raise BEDError("bed file " + sourceName +
                                   " not sorted by start index - saw item " +
                                   str(prev) + " before " + str(e))
                if sortedby == ITERATOR_SORTED_END and prev.end > e.end:
                    raise BEDError("bed file " + sourceName +
                                   " not sorted by end index - saw item " +
                                   str(prev) + " before " + str(e))

            # starting a new chrom.. make sure we haven't already seen it
            if prev is not None and prev.chrom != e.chrom:
                if (sortedby == ITERATOR_SORTED_START or
                        sortedby == ITERATOR_SORTED_END or
                        sortedby == ITERATOR_SORTED_CHROM) and \
                        (e.chrom in chromsSeen or prev.chrom > e.chrom):
                    raise BEDError("BED file " + sourceName +
                                   " not sorted by chrom")
                chromsSeen.add(e.chrom)

            # all good..
            yield e
            prev = e
    finally:
        if openedHere:
            filehandle.close()
def testPairedIterator(self):
    """
    Run pairedBEDIterator over three small BED streams under two
    configurations and compare, stream by stream, what it yields against
    hand-written expectations.
    """
    debug = False

    def bed(chrom, start, end, score, strand):
        # one tab-delimited BED line; every element in this test is named X
        return "\t".join([chrom, start, end, "X", score, strand])

    def stream_text(lines):
        # each input line is newline-terminated
        return "".join(ln + "\n" for ln in lines)

    in1 = stream_text([bed("chr1", "10", "15", "1", "+"),
                       bed("chr1", "20", "25", "2", "-"),
                       bed("chr1", "40", "47", "3", "+"),
                       bed("chr2", "10", "15", "4", "-")])
    in2 = stream_text([bed("chr1", "10", "15", "1", "+"),
                       bed("chr1", "40", "47", "2", "+"),
                       bed("chr2", "10", "15", "3", "-")])
    in3 = stream_text([bed("chr1", "20", "25", "1", "+"),
                       bed("chr1", "40", "47", "2", "+"),
                       bed("chr2", "10", "15", "3", "-"),
                       bed("chr3", "20", "25", "4", "+")])

    def run_and_check(e1, e2, e3, **iterArgs):
        # fresh streams for every run
        instms = [DummyInputStream(x) for x in [in1, in2, in3]]
        allOut = list(pairedBEDIterator(instms, **iterArgs))

        # regroup the yielded triples into one list per input stream
        got1, got2, got3 = [], [], []
        for x1, x2, x3 in allOut:
            got1.append(x1)
            got2.append(x2)
            got3.append(x3)

        # parse all expectations up front, then compare stream by stream
        expected = [[parseBEDString(x, scoreType=float) for x in raw]
                    for raw in (e1, e2, e3)]
        for g, e in zip((got1, got2, got3), expected):
            if debug:
                sys.stderr.write("expect\n" +
                                 "\n".join([str(x) for x in e]) + "\n")
                sys.stderr.write("got\n" +
                                 "\n".join([str(x) for x in g]) + "\n")
            assert (g == e)

    # first, ignore strand, name and score and don't mirror missing elements
    run_and_check(
        [bed("chr1", "40", "47", "3", "+"),
         bed("chr2", "10", "15", "4", "-")],
        [bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-")],
        [bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-")],
        mirror=False, mirrorScore=None, ignoreStrand=True,
        ignoreScore=True, ignoreName=True)

    # now, same sort order but include strand, and mirror missing elements
    # using a score of 0
    run_and_check(
        [bed("chr1", "10", "15", "1", "+"),
         bed("chr1", "20", "25", "0", "+"),
         bed("chr1", "20", "25", "2", "-"),
         bed("chr1", "40", "47", "3", "+"),
         bed("chr2", "10", "15", "4", "-"),
         bed("chr3", "20", "25", "0", "+")],
        [bed("chr1", "10", "15", "1", "+"),
         bed("chr1", "20", "25", "0", "+"),
         bed("chr1", "20", "25", "0", "-"),
         bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-"),
         bed("chr3", "20", "25", "0", "+")],
        [bed("chr1", "10", "15", "0", "+"),
         bed("chr1", "20", "25", "1", "+"),
         bed("chr1", "20", "25", "0", "-"),
         bed("chr1", "40", "47", "2", "+"),
         bed("chr2", "10", "15", "3", "-"),
         bed("chr3", "20", "25", "4", "+")],
        mirror=True, mirrorScore=0, ignoreStrand=False,
        ignoreScore=True, ignoreName=True)
def BEDIterator(filehandle, sortedby=None, verbose=False, scoreType=int,
                dropAfter=None):
    """
    Get an iterator for a BED file

    :param filehandle: this can be either a string, or a stream-like object.
                       In the former case, it is treated as a filename and
                       the file opened here is closed again when iteration
                       finishes, fails, or the iterator is closed. The
                       format of the file/stream must be BED.
    :param sortedby:   if None, order is not checked.
                       if == ITERATOR_SORTED_START, elements in file must
                       be sorted by chrom and start index (an exception
                       is raised if they are not)
                       if == ITERATOR_SORTED_END, element must be sorted
                       by chrom and end index.
                       if == ITERATOR_SORTED_CHROM, only the chrom order is
                       checked.
                       if == ITERATOR_SORTED_NAME, element names must not
                       decrease from one element to the next.
                       For the three chrom-based orders, chroms must not
                       decrease (string comparison) and a chrom must not
                       re-appear after a different one.
    :param verbose:    if True, output additional progress messages to
                       stderr
    :param scoreType:  The data type for scores (the fifth column) in the
                       BED file.
    :param dropAfter:  an int indicating that any fields after and including
                       this field should be ignored as they don't conform to
                       the BED format. By default, None, meaning we use all
                       fields. Index from zero.
    :return: iterator where subsequent calls to next() yield the next BED
             element in the stream as a GenomicInterval object.
    :raise BEDError: if a line cannot be parsed, or if the elements are not
                     in the order demanded by sortedby.
    """
    chromsSeen = set()
    prev = None

    # only close what we opened; caller-supplied streams stay open
    openedHere = False
    if isinstance(filehandle, str):
        filehandle = open(filehandle)
        openedHere = True

    # in-memory streams have no name; use the same placeholder in every
    # error message rather than failing with an AttributeError
    sourceName = str(getattr(filehandle, "name", "UNNAMED STREAM"))

    try:
        if verbose:
            try:
                pind = ProgressIndicator(
                    totalToDo=os.path.getsize(filehandle.name),
                    messagePrefix="completed",
                    messageSuffix="of processing " + filehandle.name)
            except (AttributeError, OSError):
                sys.stderr.write("BEDIterator -- warning: " +
                                 "unable to show progress for stream")
                verbose = False

        for line in filehandle:
            if verbose:
                pind.done = filehandle.tell()
                pind.showProgress()

            if line.strip() == "":
                continue
            try:
                e = parseBEDString(line, scoreType, dropAfter=dropAfter)
            except GenomicIntervalError as err:
                raise BEDError(str(err) + " on line " + line)

            # sorting by name?
            if ((sortedby == ITERATOR_SORTED_NAME and prev is not None) and
                    (prev.name > e.name)):
                raise BEDError("bed file " + sourceName +
                               " not sorted by element name" +
                               " found " + e.name + " after " + prev.name)

            # first item
            if prev is None:
                chromsSeen.add(e.chrom)

            # on same chrom as the prev item, make sure order is right
            if prev is not None and sortedby is not None and \
                    e.chrom == prev.chrom:
                if sortedby == ITERATOR_SORTED_START and \
                        prev.start > e.start:
                    raise BEDError("bed file " + sourceName +
                                   " not sorted by start index - saw item " +
                                   str(prev) + " before " + str(e))
                if sortedby == ITERATOR_SORTED_END and prev.end > e.end:
                    raise BEDError("bed file " + sourceName +
                                   " not sorted by end index - saw item " +
                                   str(prev) + " before " + str(e))

            # starting a new chrom.. make sure we haven't already seen it
            if prev is not None and prev.chrom != e.chrom:
                if (sortedby == ITERATOR_SORTED_START or
                        sortedby == ITERATOR_SORTED_END or
                        sortedby == ITERATOR_SORTED_CHROM) and \
                        (e.chrom in chromsSeen or prev.chrom > e.chrom):
                    raise BEDError("BED file " + sourceName +
                                   " not sorted by chrom")
                chromsSeen.add(e.chrom)

            # all good..
            yield e
            prev = e
    finally:
        if openedHere:
            filehandle.close()