def repeat_masker_iterator(fh, alignment_index=None, header=True,
                           verbose=False):
    """
    Iterator for repeatmasker coordinate annotation files. These files
    describe the location of repeat occurrences. There is (optionally) a
    two-line header with the names of the fields (ignored by the iterator, if
    present). Each line is a record of an occurrence. The description of
    fields for each line is given in from_repeat_masker_string.

    Blank lines (after stripping whitespace) are skipped. If ``fh`` is a
    filename, the file is opened here and closed again once the iterator is
    exhausted or closed; a stream passed in by the caller is left open.

    :param fh:              stream-like object, or string filename, to load
                            the annotations from
    :param alignment_index: an IndexedFile for full alignments; keys should
                            be repeat-masker IDs
    :param header:          if True, expect and discard the two-line header;
                            otherwise we will expect there is no header
    :param verbose:         if True, output additional status messages about
                            progress to stderr.
    :return: a generator yielding one object per non-blank record line, as
             built by retrotransposon.from_repeat_masker_string; when
             alignment_index is given, each object's pairwise_alignment is
             set to a JustInTimePairwiseAlignment keyed on its uniq_id.
    """
    # Only a handle that we opened ourselves is ours to close.
    opened_here = isinstance(fh, str)
    strm = open(fh) if opened_here else fh

    try:
        # try to get an idea of how much data we have...
        if verbose:
            try:
                total = os.path.getsize(strm.name)
                pind = ProgressIndicator(totalToDo=total,
                                         messagePrefix="completed",
                                         messageSuffix="of processing " +
                                                       strm.name)
            except (AttributeError, OSError) as e:
                # No usable backing file (stream without a name, or a name
                # that is not a real path): report it and carry on without
                # progress output rather than failing.
                sys.stderr.write(str(e))
                sys.stderr.write("completed [unknown] of processing index")
                verbose = False

        if header:
            # Discard the first 2 lines. The default argument stops a short
            # or empty stream from raising StopIteration inside this
            # generator, which Python 3.7+ turns into RuntimeError.
            next(strm, None)
            next(strm, None)

        for line in strm:
            if verbose:
                # NOTE(review): Python 3 text-mode files can refuse tell()
                # while being iterated with a for loop -- confirm before
                # relying on verbose mode there.
                pind.done = strm.tell()
                pind.showProgress()

            line = line.strip()
            if line == "":
                continue

            rto = retrotransposon.from_repeat_masker_string(line)
            if alignment_index is not None:
                rto.pairwise_alignment = \
                    JustInTimePairwiseAlignment(alignment_index, rto.uniq_id)
            yield rto
    finally:
        if opened_here:
            strm.close()
def test_basic_iterator(self):
    # Parse the annotation fixture without an alignment index and check that
    # every record matches the one built directly from its source line.
    annotation_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(annotation_stream))
    self.assertEqual(len(elems), 6)

    for idx, parsed in enumerate(elems):
        expected = retrotransposon.from_repeat_masker_string(
            self.indv_an[idx])
        self.assertEqual(parsed, expected)

    # Alignments are not available, so liftover should work only on
    # coordinates; just check one element to make sure it is working.
    # elem[0]: 15, 67 -> 85, 141 - (53 to 57; gap_length = 14)
    expected_coords = [(128, 142), (113, 127), (98, 112), (86, 97)]
    expected_regions = [GenomicInterval("A#B", start, end, strand='+')
                        for start, end in expected_coords]
    query = GenomicInterval("chr1", 10, 100)
    self.assertEqual(elems[0].liftover(query), expected_regions)
def test_iterator_with_alignment_index(self):
    # Parse the annotation fixture together with an index of full
    # alignments and check both the parsed records and the liftover results.

    def uid_of(alignment):
        # Key used by the index: the repeat-masker ID stored in the
        # alignment's meta data.
        return alignment.meta[repeatmaskerAlignments.RM_ID_KEY]

    alignment_stream = StringIO.StringIO(self.rm_rc_1_input)
    index = IndexedFile(alignment_stream, repeat_masker_alignment_iterator,
                        uid_of)

    annotation_stream = StringIO.StringIO(self.ann)
    elems = list(repeat_masker_iterator(annotation_stream,
                                        alignment_index=index))
    self.assertEqual(len(elems), 6)

    for idx, parsed in enumerate(elems):
        expected = retrotransposon.from_repeat_masker_string(
            self.indv_an[idx])
        self.assertEqual(parsed, expected)

    # Alignments were provided, so liftover should be using them; test one
    # element to make sure they were matched up properly.
    lifted = elems[0].liftover(GenomicInterval("chr1", 10, 100))
    self.assertEqual(lifted, [(132, 142), (120, 131), (88, 118), (85, 87)])

    # Also test one of the elements that had no alignment; here we expect
    # failure.
    unaligned_liftover = elems[4].liftover
    unaligned_query = GenomicInterval("chr1", 15200, 15400)
    self.assertRaises(IndexError, unaligned_liftover, unaligned_query)