示例#1
0
 def test_different_ungapped_legnths(self):
   """Check MSA construction when members differ in ungapped length."""
   members = [
     Sequence("s1", "------TCGCGTAGC", 100, 129),
     Sequence("s2", "---------CGCAGC", 1000, 1006),
     Sequence("s3", "ATCGCGT--------", 120, 127),
   ]
   alignment = MultipleSequenceAlignment(members)
   # the expected column maps each sequence name to its symbol
   expected_column = {"s1": "T", "s2": "-", "s3": "T"}
   self.assertEqual(alignment.get_column(7), expected_column)
示例#2
0
 def test_empty_seq_with_diff_length(self):
   """Check an MSA that mixes real sequences with an UnknownSequence.

   The UnknownSequence spans a different number of positions than the two
   gapped sequences; the column lookup is expected to report only s1 and s2.
   """
   known_a = Sequence("s1", "-TCGCGTAGC---CGC-TAGCTGATGCGAT-CTGA", 100, 129)
   known_b = Sequence("s2", "ATCGCGTAGCTAGCGCG-AGCTG---CGATGCT--", 1000, 1029)
   unknown = UnknownSequence("s3", 25049, 25049 + 1601, "+", 50103)
   alignment = MultipleSequenceAlignment([known_a, known_b, unknown])
   expected_column = {"s1": "-", "s2": "A"}
   self.assertEqual(alignment.get_column(1), expected_column)
示例#3
0
文件: read.py 项目: pjuren/pyokit
    def __init__(self, seq, name=None, qual=None, use_mut_str=False):
        """
      Constructor for FastqSequence class; see class level documentation for
      descriptions of parameters.

      :param seq:         the sequence data for the read.
      :param name:        the name of the read.
      :param qual:        the quality string; must be the same length as seq.
      :param use_mut_str: passed through unchanged to the Sequence constructor.
      :raise NGSReadError: if qual is missing, or if its length differs from
                           the length of seq.
    """
        # qual defaults to None in the signature, but len(None) would raise an
        # opaque TypeError below; report the problem as a read error instead.
        if qual is None:
            raise NGSReadError("failed to create FastqSequence object -- "
                               "no quality string was provided")

        if len(seq) != len(qual):
            msg = (
                "failed to create FastqSequence object -- length of sequence "
                + "data (" + str(len(seq)) +
                ") does not match length of quality " + "string (" +
                str(len(qual)) + "). seq data: " + seq + "  qual data: " +
                qual)
            raise NGSReadError(msg)

        Sequence.__init__(self, name, seq, use_mut_str)

        self.seq_qual = qual

        # bounds for quality scores; the attribute names (including the
        # 'LOWSET' spelling) are kept as-is because other code may read them.
        # ILLUMINA 1.3+ Phred+64
        self.LOWSET_SCORE = 64
        self.HIGHEST_SCORE = 104
        # Illumina 1.8+ Phred+33
        self.LOWSET_SCORE_ILL_18_PHRD_33 = 33
        self.HIGHEST_SCORE_ILL_18_PHRD_33 = 74
示例#4
0
文件: read.py 项目: pjuren/pyokit
    def __init__(self, seq, name=None, qual=None, use_mut_str=False):
        """
      Constructor for FastqSequence class; see class level documentation for
      descriptions of parameters.

      :param seq:         the sequence data for the read.
      :param name:        the name of the read.
      :param qual:        the quality string; must be the same length as seq.
      :param use_mut_str: passed through unchanged to the Sequence constructor.
      :raise NGSReadError: if qual is missing, or if its length differs from
                           the length of seq.
    """
        # qual defaults to None in the signature, but len(None) would raise an
        # opaque TypeError below; report the problem as a read error instead.
        if qual is None:
            raise NGSReadError("failed to create FastqSequence object -- "
                               "no quality string was provided")

        if len(seq) != len(qual):
            msg = (
                "failed to create FastqSequence object -- length of sequence "
                + "data ("
                + str(len(seq))
                + ") does not match length of quality "
                + "string ("
                + str(len(qual))
                + "). seq data: "
                + seq
                + "  qual data: "
                + qual
            )
            raise NGSReadError(msg)

        Sequence.__init__(self, name, seq, use_mut_str)

        self.seq_qual = qual

        # bounds for quality scores; the attribute names (including the
        # 'LOWSET' spelling) are kept as-is because other code may read them.
        self.LOWSET_SCORE = 64
        self.HIGHEST_SCORE = 104
示例#5
0
  def setUp(self):
    """Create the pairwise and multiple alignments shared by the tests."""
    # s2 and s3 carry the same gapped text; they differ in coordinates and
    # in the strand argument given for s3.
    shared_gapped = "ATCGCGTAGCTAGCGCG-AGCTG---CGATGCT--"
    first = Sequence("s1", "-TCGCGTAGC---CGC-TAGCTGATGCGAT-CTGA", 100, 129)
    second = Sequence("s2", shared_gapped, 1000, 1029)
    third = Sequence("s3", shared_gapped, 969, 998, "-")

    self.pa1 = PairwiseAlignment(first, second)
    self.pa2 = PairwiseAlignment(first, third)
    self.msa1 = MultipleSequenceAlignment([first, second, third])
示例#6
0
文件: read.py 项目: pjuren/pyokit
 def testClipadaptor(self):
     """Check that clip_adaptor masks the adaptor match in a read with Ns.

     clip_adaptor is called for its effect on input_seq (its return value is
     not used), so the read itself is compared against the expectation.
     """
     input_seq = NGSRead("ACTGCTAGCGATCGACT", "n1", "QQQQQQQQQQQQQQQQQ")
     adaptor = Sequence("adap", "AGCGATAGACT")
     expect = NGSRead("ACTGCTNNNNNNNNNNN", "n1", "QQQQQQQQQQQQQQQQQ")
     clip_adaptor(input_seq, adaptor)
     got = input_seq
     # assertEqual applies the same == comparison as before, but reports both
     # operands when the check fails instead of just "False is not true".
     self.assertEqual(expect, got)
示例#7
0
文件: read.py 项目: pjuren/pyokit
    def __eq__(self, seq):
        """
      Compare this read with another for equality. The comparison defined by
      Sequence is applied first; only when it holds is the quality data of
      the two objects compared as well.

      :param seq: the other sequence to compare against.
      :return: the (falsy) result of the Sequence comparison if that fails,
               otherwise the result of comparing the quality data.
    """
        base_equal = Sequence.__eq__(self, seq)
        if not base_equal:
            return base_equal
        return self.seq_qual == seq.seq_qual
示例#8
0
文件: read.py 项目: pjuren/pyokit
    def __eq__(self, seq):
        """
      Compare this read with another for equality. The comparison defined by
      Sequence is applied first; only when it holds is the quality data of
      the two objects compared as well.

      :param seq: the other sequence to compare against.
      :return: the (falsy) result of the Sequence comparison if that fails,
               otherwise the result of comparing the quality data.
    """
        base_equal = Sequence.__eq__(self, seq)
        if not base_equal:
            return base_equal
        return self.seq_qual == seq.seq_qual
示例#9
0
文件: read.py 项目: pjuren/pyokit
    def __ne__(self, read):
        """
      Compare this read with another for inequality. The reads differ if the
      comparison defined by Sequence says so, or failing that, if their
      quality data differs.

      :param read: the other read to compare against.
      :return: the (truthy) result of the Sequence comparison if the reads
               differ there, otherwise the result of comparing quality data.
    """
        base_differs = Sequence.__ne__(self, read)
        if base_differs:
            return base_differs
        return self.seq_qual != read.seq_qual
示例#10
0
文件: read.py 项目: pjuren/pyokit
    def __ne__(self, read):
        """
      Compare this read with another for inequality. The reads differ if the
      comparison defined by Sequence says so, or failing that, if their
      quality data differs.

      :param read: the other read to compare against.
      :return: the (truthy) result of the Sequence comparison if the reads
               differ there, otherwise the result of comparing quality data.
    """
        base_differs = Sequence.__ne__(self, read)
        if base_differs:
            return base_differs
        return self.seq_qual != read.seq_qual
示例#11
0
文件: maf.py 项目: pjuren/pyokit
def __build_sequence(parts):
    """Create a Sequence from the tokenized fields of a MAF 's' line.

  The fields are read by position from parts:
          * parts[1] -- source sequence name
          * parts[2] -- zero-based start coordinate; for a strand other than
                        '+', it is relative to the start of the reverse
                        complement
          * parts[3] -- ungapped length of the sequence
          * parts[4] -- strand
          * parts[5] -- src size, the full length of the source sequence
          * parts[6] -- the sequence itself

  :param parts: the pre-tokenized MAF line, with the 's' tag at index 0.
  :return: a Sequence with start and end on the '+' strand, plus the number
           of source positions remaining after end.
  """
    strand = parts[4]
    ungapped_len = int(parts[3])
    src_size = int(parts[5])
    offset = int(parts[2])

    if strand == "+":
        start = offset
    else:
        # flip the reverse-complement-relative offset onto the + strand
        start = src_size - offset - ungapped_len

    end = start + ungapped_len
    return Sequence(parts[1], parts[6], start, end, strand, src_size - end)
示例#12
0
def fastaIterator(fn, useMutableString=False, verbose=False):
  """
    A generator function which yields Sequence objects from a fasta-format
    file or stream.

    :param fn: a file-like stream or a string; if this is a string, it's
               treated as a filename, else it's treated it as a file-like
               object, which must have a readline() method.
    :param useMutableString: if True, construct sequences from lists of chars,
                             rather than python string objects, to allow
                             more efficient editing. Use with caution.
    :param verbose: if True, output additional status messages to stderr about
                    progress
  """
  # a handle opened here is ours to close; a stream supplied by the caller
  # is left open for the caller to manage.
  opened_here = isinstance(fn, str)
  fh = open(fn) if opened_here else fn

  try:
    pind = None
    if verbose:
      try:
        pind = __build_progress_indicator(fh)
      except ProgressIndicatorError as e:
        sys.stderr.write("Warning: unable to show progress for stream. " +
                         "Reason: " + str(e))
        # carry on without progress reporting
        verbose = False

    prev_line = None
    while True:
      seqHeader = __read_seq_header(fh, prev_line)
      # drop the first character of the header line and surrounding space
      name = seqHeader[1:].strip()
      seq_data, prev_line = __read_seq_data(fh)
      if verbose:
        pind.done = fh.tell()
        pind.showProgress(to_strm=sys.stderr)
      yield Sequence(name, seq_data, useMutableString)

      # remember where we stopped for next call, or finish
      if prev_line == "":
        break
  finally:
    # runs when the generator is exhausted, closed or garbage collected
    if opened_here:
      fh.close()
示例#13
0
 def test_failure_on_different_lengths(self):
   """Constructing an MSA from ragged sequences must raise an error."""
   ragged = [
     Sequence("s1", "-TCGCGTAGC---CGC-TAGGATGCGAT-CTGA", 100, 127),
     Sequence("s2", "ATCGCGTAGCTAGCGCG-AGCTG---CGATGCT--", 1000, 1029),
   ]
   with self.assertRaises(MultipleAlignmentError):
     MultipleSequenceAlignment(ragged)
示例#14
0
文件: maf.py 项目: pjuren/pyokit
    def setUp(self):
        """Set up some MAF files to use in unit tests.

        Builds two MAF blocks as raw text (self.b1, self.b2), the two joined
        by a blank line (self.maf1), and Sequence / UnknownSequence objects
        for some of the lines in those blocks. No object is built here for
        the tupBel1 lines of either block.
        """
        # gapped sequence ('_s', '_seq') and quality ('_q') strings, block 1
        b1_hg19_seq = "atctccaagagggcataaaacac-tgagtaaacagctcttttatatgtgtttcctgga"
        b1_panTro_s = "atctccaagagggcataaaacac-tgagtaaacagctctt--atatgtgtttcctgga"
        b1_panTro_q = "99999999999999999999999-9999999999999999--9999999999999999"
        b1_tarSyr_s = "atctccaagagggctgaaaatgc-caaatga-----------tcacacgtttcctgga"
        b1_tarSyr_q = "79295966999999999999998-9999799-----------9999999999765775"
        b1_tupBel_s = "ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactgga"
        b1_tupBel_q = "79648579699867994997775679665662767577569-6998745597677632"
        # raw text of block 1: an 'a' line, then 's', 'q', 'i' and 'e' lines
        b1 = "a score=28680.000000\n" +\
             "s hg19.chr22             1711 57 + 51304566 " + b1_hg19_seq + "\n" +\
             "s panTro2.chrUn          1110 59 + 58616431 " + b1_panTro_s + "\n" +\
             "q panTro2.chrUn                             " + b1_panTro_q + "\n" +\
             "i panTro2.chrUn          C 0 C 0                          " + "\n" +\
             "s tarSyr1.scaffold_5923  2859 50 -     8928 " + b1_tarSyr_s + "\n" +\
             "q tarSyr1.scaffold_5923                     " + b1_tarSyr_q + "\n" +\
             "i tarSyr1.scaffold_5923  N 0 C 0                          " + "\n" +\
             "s tupBel1.scaffold_803   33686 61 +   85889 " + b1_tupBel_s + "\n" +\
             "q tupBel1.scaffold_803                      " + b1_tupBel_q + "\n" +\
             "i tupBel1.scaffold_803   I 1 C 0                          " + "\n" +\
             "e mm4.chr6            53310102 58 + 151104725 I"
        # objects for the block 1 lines; the numeric arguments are derived
        # from the fields in the text above (e.g. 1768 = 1711 + 57, and
        # 51302798 = 51304566 - 1768).
        # presumably the arguments after the sequence data are start, end,
        # strand, remaining length and a meta-data dict -- confirm against
        # the Sequence constructor.
        self.b1_hg19 = Sequence("hg19.chr22", b1_hg19_seq, 1711, 1768, "+",
                                51302798)
        self.b1_panTro = Sequence(
            "panTro2.chrUn", b1_panTro_s, 1110, 1169, "+", 58616431 - 1169, {
                QUALITY_META_KEY: b1_panTro_q,
                LEFT_STATUS_KEY: "C",
                LEFT_COUNT_KEY: 0,
                RIGHT_STATUS_KEY: "C",
                RIGHT_COUNT_KEY: 0
            })
        # '-' strand line: coordinates are computed from the source size
        # (8928), the offset (2859) and the length (50) given in the text
        self.b1_tarSyr = Sequence(
            "tarSyr1.scaffold_5923", b1_tarSyr_s, 8928 - 2859 - 50,
            8928 - 2859, "-", 2859, {
                QUALITY_META_KEY: b1_tarSyr_q,
                LEFT_STATUS_KEY: "N",
                LEFT_COUNT_KEY: 0,
                RIGHT_STATUS_KEY: "C",
                RIGHT_COUNT_KEY: 0
            })
        # the 'e' line carries no sequence data, so it is an UnknownSequence
        self.b1_mm4 = UnknownSequence("mm4.chr6", 53310102, 53310102 + 58, "+",
                                      151104725 - (53310102 + 58),
                                      {EMPTY_ALIGNMENT_STATUS_KEY: "I"})

        # gapped sequence and quality strings, block 2
        b2_hg19_seq = "ccttcttttaattaattttgttaagg----gatttcctctagggccactgcacgtca"
        b2_panTro_s = "ccttcttttaattaattttgttatgg----gatttcgtctagggtcactgcacatca"
        b2_panTro_q = "99999999999999999999999999----999999099999999999999999999"
        b2_tarSyr_s = "tcttcttttaattaattttattgagggattgattccttattgggccactacacatta"
        b2_tarSyr_q = "999999899978999999999999999977989997998678865952859999899"
        b2_tupBel_s = "cct--gtttaaattactgtattg-gg----gatttcctatagggccgcttctcgtcc"
        b2_tupBel_q = "666--958759455555746366-68----656846556554745443677468565"
        # raw text of block 2; unlike block 1 it has no 'e' line
        b2 = "a score=31725.000000\n" +\
             "s hg19.chr22             1772 53 + 51304566 " + b2_hg19_seq + "\n" +\
             "s panTro2.chrUn          1169 53 + 58616431 " + b2_panTro_s + "\n" +\
             "q panTro2.chrUn                             " + b2_panTro_q + "\n" +\
             "i panTro2.chrUn          C 0 C 0                          " + "\n" +\
             "s tarSyr1.scaffold_5923  2909 124 -    8928 " + b2_tarSyr_s + "\n" +\
             "q tarSyr1.scaffold_5923                     " + b2_tarSyr_q + "\n" +\
             "i tarSyr1.scaffold_5923  C 0 N 0                          " + "\n" +\
             "s tupBel1.scaffold_803   33747 113 +  85889 " + b2_tupBel_s + "\n" +\
             "q tupBel1.scaffold_803                      " + b2_tupBel_q + "\n" +\
             "i tupBel1.scaffold_803 C 0 N 0              "
        # the two blocks joined by an empty line form the whole MAF 'file'
        self.maf1 = b1 + "\n\n" + b2
        # objects for the block 2 lines, derived from the text as for block 1
        self.b2_hg19 = Sequence("hg19.chr22", b2_hg19_seq, 1772, 1825, "+",
                                51302741)
        self.b2_panTro = Sequence(
            "panTro2.chrUn", b2_panTro_s, 1169, 1169 + 53, "+",
            58616431 - (1169 + 53), {
                QUALITY_META_KEY: b2_panTro_q,
                LEFT_STATUS_KEY: "C",
                LEFT_COUNT_KEY: 0,
                RIGHT_STATUS_KEY: "C",
                RIGHT_COUNT_KEY: 0
            })
        self.b2_tarSyr = Sequence(
            "tarSyr1.scaffold_5923", b2_tarSyr_s, 8928 - 2909 - 124,
            8928 - 2909, "-", 2909, {
                QUALITY_META_KEY: b2_tarSyr_q,
                LEFT_STATUS_KEY: "C",
                LEFT_COUNT_KEY: 0,
                RIGHT_STATUS_KEY: "N",
                RIGHT_COUNT_KEY: 0
            })
        # keep the raw text of each block available to the tests as well
        self.b1 = b1
        self.b2 = b2
示例#15
0
文件: read.py 项目: pjuren/pyokit
 def reverse_complement(self, is_RNA=None):
     """
   Reverse complement this read in-place: the sequence data is handled by
   Sequence.reverseComplement, after which the quality data is reversed.

   :param is_RNA: passed through unchanged to Sequence.reverseComplement.
 """
     Sequence.reverseComplement(self, is_RNA)
     flipped_quality = self.seq_qual[::-1]
     self.seq_qual = flipped_quality
示例#16
0
  def setUp(self):
    """Set up some genome alignment blocks and genome alignments for tests.

    Creates five blocks (block1..block4 and block1p), each built from four
    gapped sequences with "s1" as the second constructor argument, and two
    genome alignments: ga1 from block1 and block2, ga2 from those two plus
    block1p. block3 and block4 are not placed in any alignment here.
    """
    self.block1 = GenomeAlignmentBlock([Sequence("s1.c1", "TCTCGC-A", 11, 18),
                                        Sequence("s2.c1", "ACTGGC--", 25, 31),
                                        Sequence("s3.c2", "ACTGCCTA", 5, 13),
                                        Sequence("s4.c1", "ACT-GCTA", 58, 65)],
                                       "s1")
    self.block2 = GenomeAlignmentBlock([Sequence("s1.c2", "C------G", 21, 23),
                                        Sequence("s2.c2", "CGGTCAGG", 85, 94),
                                        Sequence("s3.c2", "-GGTC-GG", 1, 7),
                                        Sequence("s4.c3", "-GGCCAGG", 3, 11)],
                                       "s1")
    # NOTE(review): the s2.c1 entries in block3 and block4 are given 38, 35,
    # i.e. the first coordinate is larger than the second -- confirm whether
    # that is intended.
    self.block3 = GenomeAlignmentBlock([Sequence("s1.c1", "CA-TAGC-G", 20, 26),
                                        Sequence("s2.c1", "CAGTAGC-G", 38, 35),
                                        Sequence("s3.c2", "C-GT-GCAG", 5, 13),
                                        Sequence("s4.c1", "CACT-GC-G", 58,
                                                 65)],
                                       "s1")
    self.block4 = GenomeAlignmentBlock([Sequence("s1.c1", "CG-TCGA", 51, 57),
                                        Sequence("s2.c1", "CGCT-GA", 38, 35),
                                        Sequence("s3.c2", "AGGTCGC", 5, 13),
                                        Sequence("s4.c1", "CGCT-GA", 58, 65)],
                                       "s1")
    # this block defines an ambiguous alignment of part of block1: its s1.c1
    # coordinates (15, 22) overlap those of block1 (11, 18)
    self.block1p = GenomeAlignmentBlock([Sequence("s1.c1", "GCACGCT", 15, 22),
                                         Sequence("s2.c8", "GCAC-CT", 25, 31),
                                         Sequence("s3.c8", "GC-CGCT", 5, 13),
                                         Sequence("s4.c8", "GC--GCT", 58, 65)],
                                        "s1")

    # ga2 contains everything in ga1 plus the ambiguous block
    self.ga1 = GenomeAlignment([self.block1, self.block2])
    self.ga2 = GenomeAlignment([self.block1, self.block2, self.block1p])
示例#17
0
class GATestHelper(object):
    """Helper for tests involving genome alignments in concrete MAF syntax.

    Everything is defined at class level: two MAF blocks as raw text (b1,
    b2), the two joined by a blank line (maf1), and Sequence /
    UnknownSequence objects for some of the lines in those blocks. No object
    is defined for the tupBel1 lines of either block.
    """

    # this defines a single genome alignment block; '_s' / '_seq' names hold
    # gapped sequence text and '_q' names the matching quality strings
    b1_hg19_seq = "atctccaagagggcataaaacac-tgagtaaacagctcttttatatgtgtttcctgga"
    b1_panTro_s = "atctccaagagggcataaaacac-tgagtaaacagctctt--atatgtgtttcctgga"
    b1_panTro_q = "99999999999999999999999-9999999999999999--9999999999999999"
    b1_tarSyr_s = "atctccaagagggctgaaaatgc-caaatga-----------tcacacgtttcctgga"
    b1_tarSyr_q = "79295966999999999999998-9999799-----------9999999999765775"
    b1_tupBel_s = "ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactgga"
    b1_tupBel_q = "79648579699867994997775679665662767577569-6998745597677632"
    # raw text of block 1: an 'a' line, then 's', 'q', 'i' and 'e' lines
    b1 = "a score=28680.000000\n" +\
         "s hg19.chr22             1711 57 + 51304566 " + b1_hg19_seq + "\n" +\
         "s panTro2.chrUn          1110 59 + 58616431 " + b1_panTro_s + "\n" +\
         "q panTro2.chrUn                             " + b1_panTro_q + "\n" +\
         "i panTro2.chrUn          C 0 C 0                          " + "\n" +\
         "s tarSyr1.scaffold_5923  2859 50 -     8928 " + b1_tarSyr_s + "\n" +\
         "q tarSyr1.scaffold_5923                     " + b1_tarSyr_q + "\n" +\
         "i tarSyr1.scaffold_5923  N 0 C 0                          " + "\n" +\
         "s tupBel1.scaffold_803   33686 61 +   85889 " + b1_tupBel_s + "\n" +\
         "q tupBel1.scaffold_803                      " + b1_tupBel_q + "\n" +\
         "i tupBel1.scaffold_803   I 1 C 0                          " + "\n" +\
         "e mm4.chr6            53310102 58 + 151104725 I"
    # abstract repr. of some lines of block 1; the numeric arguments are
    # derived from the fields in the text above (e.g. 1768 = 1711 + 57, and
    # 51302798 = 51304566 - 1768).
    # presumably the arguments after the sequence data are start, end,
    # strand, remaining length and a meta-data dict -- confirm against the
    # Sequence constructor.
    b1_hg19 = Sequence("hg19.chr22", b1_hg19_seq, 1711, 1768, "+", 51302798)
    b1_panTro = Sequence(
        "panTro2.chrUn", b1_panTro_s, 1110, 1169, "+", 58616431 - 1169, {
            maf.QUALITY_META_KEY: b1_panTro_q,
            maf.LEFT_STATUS_KEY: "C",
            maf.LEFT_COUNT_KEY: 0,
            maf.RIGHT_STATUS_KEY: "C",
            maf.RIGHT_COUNT_KEY: 0
        })
    # '-' strand line: coordinates are computed from the source size (8928),
    # the offset (2859) and the length (50) given in the text
    b1_tarSyr = Sequence(
        "tarSyr1.scaffold_5923", b1_tarSyr_s, 8928 - 2859 - 50, 8928 - 2859,
        "-", 2859, {
            maf.QUALITY_META_KEY: b1_tarSyr_q,
            maf.LEFT_STATUS_KEY: "N",
            maf.LEFT_COUNT_KEY: 0,
            maf.RIGHT_STATUS_KEY: "C",
            maf.RIGHT_COUNT_KEY: 0
        })
    # the 'e' line carries no sequence data, so it is an UnknownSequence
    b1_mm4 = UnknownSequence("mm4.chr6", 53310102, 53310102 + 58, "+",
                             151104725 - (53310102 + 58),
                             {maf.EMPTY_ALIGNMENT_STATUS_KEY: "I"})

    # this defines a second genome alignment block
    b2_hg19_seq = "ccttcttttaattaattttgttaagg----gatttcctctagggccactgcacgtca"
    b2_panTro_s = "ccttcttttaattaattttgttatgg----gatttcgtctagggtcactgcacatca"
    b2_panTro_q = "99999999999999999999999999----999999099999999999999999999"
    b2_tarSyr_s = "tcttcttttaattaattttattgagggattgattccttattgggccactacacatta"
    b2_tarSyr_q = "999999899978999999999999999977989997998678865952859999899"
    b2_tupBel_s = "cct--gtttaaattactgtattg-gg----gatttcctatagggccgcttctcgtcc"
    b2_tupBel_q = "666--958759455555746366-68----656846556554745443677468565"
    # raw text of block 2; unlike block 1 it has no 'e' line
    b2 = "a score=31725.000000\n" +\
         "s hg19.chr22             1772 53 + 51304566 " + b2_hg19_seq + "\n" +\
         "s panTro2.chrUn          1169 53 + 58616431 " + b2_panTro_s + "\n" +\
         "q panTro2.chrUn                             " + b2_panTro_q + "\n" +\
         "i panTro2.chrUn          C 0 C 0                          " + "\n" +\
         "s tarSyr1.scaffold_5923  2909 124 -    8928 " + b2_tarSyr_s + "\n" +\
         "q tarSyr1.scaffold_5923                     " + b2_tarSyr_q + "\n" +\
         "i tarSyr1.scaffold_5923  C 0 N 0                          " + "\n" +\
         "s tupBel1.scaffold_803   33747 113 +  85889 " + b2_tupBel_s + "\n" +\
         "q tupBel1.scaffold_803                      " + b2_tupBel_q + "\n" +\
         "i tupBel1.scaffold_803 C 0 N 0              "

    # define a maf 'file' by stitching the two blocks together
    maf1 = b1 + "\n\n" + b2

    # abstract repr. of some parts of the above data.
    b2_hg19 = Sequence("hg19.chr22", b2_hg19_seq, 1772, 1825, "+", 51302741)
    b2_panTro = Sequence(
        "panTro2.chrUn", b2_panTro_s, 1169, 1169 + 53, "+",
        58616431 - (1169 + 53), {
            maf.QUALITY_META_KEY: b2_panTro_q,
            maf.LEFT_STATUS_KEY: "C",
            maf.LEFT_COUNT_KEY: 0,
            maf.RIGHT_STATUS_KEY: "C",
            maf.RIGHT_COUNT_KEY: 0
        })
    b2_tarSyr = Sequence(
        "tarSyr1.scaffold_5923", b2_tarSyr_s, 8928 - 2909 - 124, 8928 - 2909,
        "-", 2909, {
            maf.QUALITY_META_KEY: b2_tarSyr_q,
            maf.LEFT_STATUS_KEY: "C",
            maf.LEFT_COUNT_KEY: 0,
            maf.RIGHT_STATUS_KEY: "N",
            maf.RIGHT_COUNT_KEY: 0
        })
示例#18
0
文件: read.py 项目: pjuren/pyokit
 def reverse_complement(self, is_RNA=None):
   """
     Reverse complement this read in-place: the sequence data is handled by
     Sequence.reverseComplement, after which the quality data is reversed.

     :param is_RNA: passed through unchanged to Sequence.reverseComplement.
   """
   Sequence.reverseComplement(self, is_RNA)
   flipped_quality = self.seq_qual[::-1]
   self.seq_qual = flipped_quality
示例#19
0
def repeat_masker_alignment_iterator(fn, index_friendly=True, verbose=False):
    """
  Iterator for repeat masker alignment files; yields pairwise alignment
  objects.

  Iterate over a file/stream of full repeat alignments in the repeatmasker
  format. Briefly, this format is as follows: each record (alignment) begins
  with a header line (see _rm_parse_header_line documentation for details of
  header format), followed by the alignment itself (example below) and finally
  a set of key-value meta-data pairs.

  The actual alignment looks like this::

    chr1               11 CCCTGGAGATTCTTATT--AGTGATTTGGGCT 41
                             ii        v   -- v  i i    v
    C MER5B#DNA/hAT    10 CCCCAGAGATTCTGATTTAATTGGTCTGGGGT 42

    chr1               42 GACTG 47
                           v
    C MER5B#DNA/hAT    43 CACTG 48

  The 'C' indicates that its the reverse complement of the consensus. The
  central string gives information about matches; "-" indicates an
  insertion/deletion, "i" a transition (G<->A, C<->T) and "v" a transversion
  (all other substitutions).

  If fn is a filename, the file is opened here and closed again when the
  iterator is exhausted or discarded; a stream passed in by the caller is
  never closed. Input containing no header line yields nothing.

  :param fn:             filename or stream-like object to read from.
  :param index_friendly: if True, we will ensure the file/stream
                         position is before the start of the record when we
                         yield it; this requires the ability to seek within
                         the stream though, so if iterating over a
                         stream without that ability, you'll have to set this
                         to false. Further, this will disable buffering for
                         the file, to ensure file.tell() behaves correctly,
                         so a performance hit will be incurred.
  :param verbose:        if true, output progress messages to stderr.
  """
    # step 1 -- get a stream to read from; open() raises TypeError when it
    # is handed something that is not a path, in which case fn is used as
    # the stream itself.
    try:
        fh = open(fn)
        opened_here = True
    except (TypeError):
        fh = fn
        opened_here = False

    try:
        for alignment in _rm_iter_stream_alignments(fh, index_friendly,
                                                    verbose):
            yield alignment
    finally:
        # only close a handle that was created here
        if opened_here:
            fh.close()


def _rm_iter_stream_alignments(fh, index_friendly, verbose):
    """
  Yield one PairwiseAlignment per record read from the open stream fh.

  :param fh:             stream-like object to read lines from.
  :param index_friendly: if True, lines are read via fh.readline() and the
                         stream is positioned at the line preceding the
                         record's terminating line while that record is
                         yielded, then moved back afterwards.
  :param verbose:        if true, show progress when the stream allows it.
  """
    iterable = fh
    if index_friendly:
        iterable = iter(fh.readline, '')

    # build progress indicator, if we want one and we're able to
    pind = None
    if verbose:
        try:
            m_fn = ": " + fh.name
        except (AttributeError, TypeError):
            # stream has no name, or a name that is not a string
            m_fn = ""
        try:
            # measure the stream by seeking to its end, then restore position
            current = fh.tell()
            fh.seek(0, 2)
            total_progress = fh.tell()
            fh.seek(current)
            pind = ProgressIndicator(
                totalToDo=total_progress,
                messagePrefix="completed",
                messageSuffix="of processing repeat-masker "
                "alignment file" + m_fn)
        except IOError:
            pind = None

    # stream positions are only tracked (and only needed) when the caller
    # asked for index-friendly behaviour; streams that cannot report a
    # position are therefore usable with index_friendly=False.
    old_fh_pos = None
    new_fh_pos = fh.tell() if index_friendly else None

    s1 = None
    s2 = None
    s1_name = None
    s2_name = None
    s1_start = None
    s1_end = None
    s2_start = None
    s2_end = None
    meta_data = None
    alignment_line_counter = 0
    alig_l_space = 0
    prev_seq_len = 0
    rev_comp_match = None
    remaining_repeat = None
    remaining_genomic = None

    for line in iterable:
        if verbose and pind is not None:
            pind.done = fh.tell()
            pind.showProgress()

        if index_friendly:
            old_fh_pos = new_fh_pos
            new_fh_pos = fh.tell()
        line = line.rstrip()
        # blank lines are skipped, except where an annotation line is due
        # (the line between the two sequence lines of an alignment chunk)
        if line.lstrip() == "" and alignment_line_counter % 3 != 1:
            continue

        # split on whitespace but keep the separators, so leading space can
        # still be measured from s_pres_split
        s_pres_split = re.split(r'(\s+)', line)
        parts = [x for x in s_pres_split if not (x.isspace() or x == "")]

        n = len(parts)
        for i in REPEATMASKER_FIELDS_TO_TRIM:
            if n >= i + 1:
                parts[i] = parts[i].strip()

        # decide what to do with this line -- is it a header line, part of the
        # alignment or a meta-data key-value line
        if alignment_line_counter % 3 == 1:
            if (REPEATMASKER_VALIDATE_MUTATIONS
                    and not _rm_is_valid_annotation_line(line)):
                raise IOError("invalid mutation line: " + line)
            # pad the annotation to the width of the preceding sequence line
            l_space = _rm_compute_leading_space(s_pres_split) - alig_l_space
            pad_right = prev_seq_len - (l_space + len(line.strip()))
            meta_data[ANNOTATION_KEY] += ((' ' * l_space) + line.strip() +
                                          (' ' * pad_right))
            alignment_line_counter += 1
        elif _rm_is_header_line(parts, n):
            # a new header ends the previous record, if there was one
            if not (s1 is None and s2 is None and meta_data is None):
                if ANNOTATION_KEY in meta_data:
                    meta_data[ANNOTATION_KEY] = meta_data[
                        ANNOTATION_KEY].rstrip()
                if index_friendly:
                    fh.seek(old_fh_pos)
                ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+",
                               remaining_genomic)
                s2s = "-" if rev_comp_match else "+"
                ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s,
                               remaining_repeat)
                yield PairwiseAlignment(ss1, ss2, meta_data)
                if index_friendly:
                    fh.seek(new_fh_pos)
            meta_data = {}
            s1 = ""
            s2 = ""
            s1_name, s2_name = _rm_get_names_from_header(parts)
            s1_start, s1_end = _rm_get_reference_coords_from_header(parts)
            s2_start, s2_end = _rm_get_repeat_coords_from_header(parts)
            rev_comp_match = _rm_is_reverse_comp_match(parts)
            remaining_repeat = _rm_get_remaining_repeat_from_header(parts)
            remaining_genomic = _rm_get_remaining_genomic_from_header(parts)

            _rm_parse_header_line(parts, meta_data)
            alignment_line_counter = 0
        elif _rm_is_alignment_line(parts, s1_name, s2_name):
            alignment_line_counter += 1
            name, seq = _rm_extract_sequence_and_name(parts, s1_name, s2_name)
            if name == s1_name:
                s1 += seq
            elif name == s2_name:
                s2 += seq
            alig_l_space = _rm_compute_leading_space_alig(s_pres_split, seq)
            prev_seq_len = len(seq)
        else:
            k, v = _rm_parse_meta_line(parts)
            meta_data[k] = v

    # meta_data is only created when a header line is seen; without one
    # there is no record to emit (e.g. empty input).
    if meta_data is None:
        return

    # emit the final record, which has no following header to trigger it.
    # NOTE(review): unlike records emitted inside the loop, the annotation
    # string is not rstrip()ed here -- confirm whether that is intended.
    if index_friendly:
        fh.seek(old_fh_pos)
    ss1 = Sequence(s1_name, s1, s1_start, s1_end, "+", remaining_genomic)
    s2s = "-" if rev_comp_match else "+"
    ss2 = Sequence(s2_name, s2, s2_start, s2_end, s2s, remaining_repeat)
    yield PairwiseAlignment(ss1, ss2, meta_data)
    if index_friendly:
        fh.seek(new_fh_pos)