Пример #1
0
 def _prepResults(self):
     #print "Query", self.state['queryName'], self.state['queryLen']
     #print "HitINFO:", self.hitInfo
     #print "HitRecord", self.hitRecord
     #print "HitAlign:", self.alignMap        
     if ( len(self.hitInfo) == 0 and not self.state['eof'] ):
         alignment = MultipleSeqAlignment( [], self.alphabet)
         self.outList.append( alignment )
     
     for hit in self.hitInfo:
         for domain in self.hitRecord[ hit ]:
             queryStr  = "".join( self.alignMap[ hit ][ domain ][ 'query' ] )
             targetStr = "".join( self.alignMap[ hit ][ domain ][ 'target' ] )
             
             query = SeqRecord(Seq(queryStr, self.alphabet),
                 id = self.state['queryName'], description = self.state.get( 'desc', "" ),
                 annotations = {})
                 
             target = SeqRecord(Seq(targetStr, self.alphabet),
                 id = hit,
                 annotations = {})
             alignment = HMMERAlign( [query,target], self.alphabet)
             alignment._annotations = self.hitRecord[ hit ][ domain ]
             alignment._annotations[ 'seqName' ] = self.state['queryName']
             alignment._annotations[ 'hmmName' ] = hit
             
             
             self.outList.append( alignment )
Пример #2
0
    def __next__(self):
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            # Empty file - just give up.
            raise StopIteration
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = OrderedDict(
        )  # Really only need an OrderedSet, but python lacks this
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " +
                                     "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids[id] = True
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip(
                    )  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For iterlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        self.ids = ids.keys()
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
                raise ValueError(
                    "Found %i records in this alignment, told to expect %i" %
                    (len(ids), self.records_per_alignment))

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )
                name, start, end = self._identifier_split(id)
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=id,
                                   name=name,
                                   description=id,
                                   annotations={"accession": name})
                # Accession will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
Пример #3
0
    def __next__(self):
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            # Empty file - just give up.
            raise StopIteration
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier "
                                      + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip()  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For iterlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
                raise ValueError("Found %i records in this alignment, told to expect %i"
                                 % (len(ids), self.records_per_alignment))

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError("Sequences have different lengths, or repeated identifier")
                name, start, end = self._identifier_split(id)
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=id, name=name, description=id,
                                   annotations={"accession": name})
                # Accession will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
Пример #4
0
    def build_hsp():
        """Build the pairwise alignment for the HSP currently being parsed.

        Takes no arguments: it reads the query/match identifiers,
        descriptions, sequences and tag dictionaries, together with the
        header, global and alignment tags, from the enclosing scope.

        Returns:
            A MultipleSeqAlignment holding two records (query, then match).
            Its private ``_annotations`` dict contains the header tags
            updated with the alignment tags.

        Raises:
            ValueError: if neither query nor match tags were collected, or
                if the extracted query and match regions differ in length.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        tool = global_tags.get("tool", "").upper()

        q = _extract_alignment_region(query_seq, query_tags)
        if tool == "TFASTX" and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        if len(q) != len(m):
            message = """Darn... amino acids vs nucleotide coordinates?
            tool: {0}
            query_seq: {1}
            query_tags: {2}
            {3} length: {4}
            match_seq: {5}
            match_tags: {6}
            {7} length: {8}
            handle.name: {9}
            """.format(tool, query_seq, query_tags, q, len(q), match_seq,
                       match_tags, m, len(m), handle.name)
            raise ValueError(message)

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # See also Bio/AlignIO/MafIO.py for same requirement.
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags
        # (alignment tags win when a key appears in both).
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        def add_record(text, tags, rec_id, rec_name, rec_descr):
            """Append one aligned sequence to the alignment being built."""
            record = SeqRecord(
                Seq(text, alphabet),
                id=rec_id,
                name=rec_name,
                description=rec_descr,
                annotations={"original_length": int(tags["sq_len"])})
            # TODO - handle start/end coordinates properly. Short term hack for now:
            record._al_start = int(tags["al_start"])
            record._al_stop = int(tags["al_stop"])
            # The record is appended while it still carries the shared
            # alphabet; the per-record alphabet is only refined afterwards.
            alignment.append(record)

            # TODO - What if a specific alphabet has been requested?
            # TODO - Use an IUPAC alphabet?
            # TODO - Can FASTA output RNA?
            # This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in tags:
                if tags["sq_type"] == "D":
                    record.seq.alphabet = generic_dna
                elif tags["sq_type"] == "p":
                    record.seq.alphabet = generic_protein
            if "-" in text:
                if not hasattr(record.seq.alphabet, "gap_char"):
                    record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        add_record(q, query_tags, query_id, "query", query_descr)
        add_record(m, match_tags, match_id, "match", match_descr)

        return alignment
Пример #5
0
def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
    """Iterate over a MAF file-like object, yielding one alignment per block.

    Each block starts with an "a" line and ends at a blank line (or at the
    end of the handle).  Every "s" line inside the block becomes a SeqRecord
    whose id and name are the source field (generally a species name) and
    whose annotations hold ``start``, ``size``, ``strand`` (1 or -1) and
    ``srcSize``.  "i", "e" and "q" lines are currently ignored.

    The ``key=value`` pairs of the "a" line are stored, as strings, in the
    private ``_annotations`` attribute of the yielded MultipleSeqAlignment.

    Arguments:
     - handle    - iterator over the lines of the MAF data
     - seq_count - if given, every block is asserted to contain exactly
                   this many records
     - alphabet  - alphabet passed to each Seq and to the alignment

    Raises ValueError for malformed "a" or "s" lines, for a dot/period in
    the first sequence of a block, and for unexpected lines inside a block.
    """
    inside_block = False

    block_annotations = []
    block_records = []

    while True:
        # An exhausted handle is treated like a blank line, so the last
        # block is flushed by the same code path as every other block.
        line = next(handle, "")

        if not inside_block:
            if line.startswith("a"):
                # start a block of records
                inside_block = True

                pairs = line.strip().split()[1:]
                if len(pairs) != line.count("="):
                    raise ValueError(
                        "Error parsing alignment - invalid key in 'a' line")

                block_annotations = dict(pair.split("=") for pair in pairs)
            elif not line:
                # end of the handle and nothing pending
                break
            # comments and any other line between blocks are skipped
            continue

        if line.startswith("s"):
            # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
            fields = line.strip().split()
            if len(fields) != 7:
                raise ValueError(
                    "Error parsing alignment - 's' line must have 7 fields"
                )
            _, src, start, size, strand_symbol, src_size, text = fields

            # convert MAF-style +/- strand to biopython-type 1/-1;
            # anything other than "-" is treated as the forward strand
            strand = -1 if strand_symbol == "-" else 1

            anno = {
                "start": int(start),
                "size": int(size),
                "strand": strand,
                "srcSize": int(src_size)
            }

            # a dot/period means "same as the first sequence at this column"
            if "." in text:
                if not block_records:
                    raise ValueError(
                        "Found dot/period in first sequence of alignment")

                reference = str(block_records[0].seq)
                text = "".join(
                    ref_char if char == "." else char
                    for char, ref_char in zip(text, reference))

            block_records.append(
                SeqRecord(Seq(text, alphabet),
                          id=src,
                          name=src,
                          description="",
                          annotations=anno))
        elif line.startswith(("i", "e", "q")):
            # TODO: "i" - information about what is in the aligned species
            #       DNA before and after the immediately preceding "s" line
            # TODO: "e" - information about the size of the gap between the
            #       alignments that span the current block
            # TODO: "q" - quality of each aligned base for the species.
            #       Need to find documentation on this, looks like ASCII 0-9
            #       or gap?  Can then store in each SeqRecord's
            #       .letter_annotations dictionary, perhaps as the raw string
            #       or turned into integers / None for gap?
            pass
        elif not line.strip():
            # end of the block: emit it and reset the per-block state
            if seq_count is not None:
                assert len(block_records) == seq_count

            alignment = MultipleSeqAlignment(block_records, alphabet)
            # TODO - Introduce an annotated alignment class?
            # See also Bio/AlignIO/FastaIO.py for same requirement.
            # For now, store the annotation a new private property:
            alignment._annotations = block_annotations

            yield alignment

            inside_block = False
            block_annotations = []
            block_records = []
        else:
            raise ValueError(
                "Error parsing alignment - unexpected line:\n%s" %
                (line, ))
Пример #6
0
    def build_hsp():
        """Build the pairwise alignment for the HSP currently being parsed.

        Takes no arguments: it reads the query/match identifiers,
        descriptions, sequences and tag dictionaries, together with the
        header, global and alignment tags, from the enclosing scope.

        Returns:
            A MultipleSeqAlignment holding two records (query, then match).
            Its private ``_annotations`` dict contains the header tags
            updated with the alignment tags.

        Raises:
            ValueError: if neither query nor match tags were collected, or
                if the extracted query and match regions differ in length.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r"
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        tool = global_tags.get("tool", "").upper()

        q = _extract_alignment_region(query_seq, query_tags)
        if tool == "TFASTX" and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        if len(q) != len(m):
            message = """Darn... amino acids vs nucleotide coordinates?
            tool: {0}
            query_seq: {1}
            query_tags: {2}
            {3} length: {4}
            match_seq: {5}
            match_tags: {6}
            {7} length: {8}
            handle.name: {9}
            """.format(tool, query_seq, query_tags, q, len(q), match_seq,
                       match_tags, m, len(m), handle.name)
            raise ValueError(message)

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # See also Bio/AlignIO/MafIO.py for same requirement.
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags
        # (alignment tags win when a key appears in both).
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        def add_record(text, tags, rec_id, rec_name, rec_descr):
            """Append one aligned sequence to the alignment being built."""
            record = SeqRecord(Seq(text, alphabet),
                               id=rec_id,
                               name=rec_name,
                               description=rec_descr,
                               annotations={"original_length": int(tags["sq_len"])})
            # TODO - handle start/end coordinates properly. Short term hack for now:
            record._al_start = int(tags["al_start"])
            record._al_stop = int(tags["al_stop"])
            # The record is appended while it still carries the shared
            # alphabet; the per-record alphabet is only refined afterwards.
            alignment.append(record)

            # TODO - What if a specific alphabet has been requested?
            # TODO - Use an IUPAC alphabet?
            # TODO - Can FASTA output RNA?
            # This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in tags:
                if tags["sq_type"] == "D":
                    record.seq.alphabet = generic_dna
                elif tags["sq_type"] == "p":
                    record.seq.alphabet = generic_protein
            if "-" in text:
                if not hasattr(record.seq.alphabet, "gap_char"):
                    record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        add_record(q, query_tags, query_id, "query", query_descr)
        add_record(m, match_tags, match_id, "match", match_descr)

        return alignment
Пример #7
0
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This function is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print("Alignment length %i" % a.get_alignment_length())
            for record in a:
                print(record.seq, record.name, record.id)

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.
    """
    if alphabet is None:
        alphabet = single_letter_alphabet

    # Parser states.
    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def build_hsp():
        """Build the pairwise alignment for the HSP currently being parsed.

        Reads the query/match identifiers, descriptions, sequences and tag
        dictionaries from the enclosing scope and returns a
        MultipleSeqAlignment with two records (query, then match) whose
        private ``_annotations`` dict holds the header tags updated with the
        alignment tags.

        Raises ValueError if neither query nor match tags were collected.
        An AssertionError raised while extracting the aligned regions or
        checking their lengths is re-raised after debug details are printed.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r"
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                # Quick hack until I can work out how -, * and / characters
                # and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError:
            # Dump the parser state to help diagnose the mismatch, then let
            # the original exception propagate unchanged.
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print(q, len(q))
            print(match_seq)
            print(match_tags)
            print(m, len(m))
            print(handle.name)
            raise

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        # Match
        # =====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Пример #8
0
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This class is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print("Alignment length %i" % a.get_alignment_length())
            for record in a:
                print("%s %s %s" % (record.seq, record.name, record.id))

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.
    """
    if alphabet is None:
        alphabet = single_letter_alphabet

    # Symbolic parser states.  build_hsp() below does not read them.
    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def append_record(alignment, sequence, tags, identifier, name, description):
        """Build one alignment row from its tags and append it to alignment.

        The record is appended first and its alphabet refined afterwards,
        which is the order the query and match rows have always been built in.
        """
        record = SeqRecord(
            Seq(sequence, alphabet),
            id=identifier,
            name=name,
            description=description,
            annotations={"original_length": int(tags["sq_len"])},
        )
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            if tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in sequence and not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    def build_hsp():
        """Return a two-row MultipleSeqAlignment for the current hit.

        Reads query_id, query_descr, query_seq, query_tags, match_id,
        match_descr, match_seq, match_tags, header_tags, align_tags and
        global_tags from the enclosing scope (they are not assigned in the
        part of this function shown here).

        Raises ValueError if neither the query nor the match has any tags,
        and AssertionError (carrying the parsing context in its message)
        if the trimmed query and match sequences differ in length.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        q = "?"  # Placeholder so the diagnostics below always have a value
        m = "?"  # Placeholder so the diagnostics below always have a value
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                # Quick hack until I can work out how -, * and / characters
                # and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            if len(q) != len(m):
                raise AssertionError("query and match regions differ in length")
        except AssertionError as err:
            # Put the context into the exception instead of printing it, and
            # do not assume the handle has a .name (e.g. StringIO has none).
            raise AssertionError(
                "Darn... amino acids vs nucleotide coordinates?\n"
                "%s\n"
                "tool: %s\n"
                "query_seq: %s\n"
                "query_tags: %r\n"
                "q: %s (len %i)\n"
                "match_seq: %s\n"
                "match_tags: %r\n"
                "m: %s (len %i)\n"
                "handle: %s"
                % (
                    err,
                    tool,
                    query_seq,
                    query_tags,
                    q,
                    len(q),
                    match_seq,
                    match_tags,
                    m,
                    len(m),
                    getattr(handle, "name", "<unnamed handle>"),
                )
            )

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # Alignment tags are applied last so they win on a key clash.
        alignment._annotations.update(header_tags)
        alignment._annotations.update(align_tags)

        # Query row first, then the match row.
        append_record(alignment, q, query_tags, query_id, "query", query_descr)
        append_record(alignment, m, match_tags, match_id, "match", match_descr)

        return alignment
Пример #9
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Reads one Stockholm record and returns it as a MultipleSeqAlignment.
        As a side effect this sets self.ids, self.sequences,
        self.seq_annotation and self.seq_col_annotation, and stores a
        following "# STOCKHOLM 1.0" line in self._header for the next call.

        Raises StopIteration if the handle is exhausted or the record holds
        no sequences.  Raises ValueError if the header is missing, a sequence
        line cannot be split into identifier and sequence, the record count
        differs from self.records_per_alignment, sequences differ in length,
        or a per-column annotation does not match the alignment length.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            # Empty file - just give up.
            raise StopIteration
        if line.strip() != "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = {}  # Really only need an OrderedSet, but python lacks this
        gs = {}
        gr = {}
        # NOTE(review): gf is filled in below but nothing in this method
        # reads it afterwards.
        gf = {}
        gc = {}
        passed_end_alignment = False
        while True:
            line = handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == "# STOCKHOLM 1.0":
                # Start of the next record; keep it for the next call.
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError(
                        "Could not split line into identifier and sequence:\n"
                        + line)
                seq_id, seq = parts
                ids.setdefault(seq_id, True)
                # Interleaved blocks repeat the identifier, so concatenate.
                seqs[seq_id] = seqs.get(seq_id, "") + seq.replace(".", "-")
            else:
                # Comment line or meta-data.  Lines shorter than five
                # characters cannot match any of the markers below.
                marker = line[:5]
                payload = line[5:].strip()
                if marker == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    try:
                        feature, text = payload.split(None, 1)
                    except ValueError:
                        # Free text may be empty; treat it like the
                        # empty-text case of #=GS below.
                        feature, text = payload, ""
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    gf.setdefault(feature, []).append(text)
                elif marker == "#=GC ":
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    # Split once only: two targets are unpacked, so a larger
                    # maxsplit would fail on markup containing whitespace.
                    feature, text = payload.split(None, 1)
                    # append to any previous entry
                    gc[feature] = gc.get(feature, "") + text.strip()
                    # Might be interleaved blocks, so can't check length yet
                elif marker == "#=GS ":
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    try:
                        seq_id, feature, text = payload.split(None, 2)
                    except ValueError:
                        # Free text can sometimes be empty, which a one line split throws an error for.
                        # See https://github.com/biopython/biopython/issues/2982 for more details
                        seq_id, feature = payload.split(None, 1)
                        text = ""
                    gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
                elif marker == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    seq_id, feature, text = payload.split(None, 2)
                    markup = gr.setdefault(seq_id, {})
                    # append to any previous entry
                    markup[feature] = markup.get(feature, "") + text.strip()
                    # Might be interleaved blocks, so can't check length yet
            # Next line...

        assert len(seqs) <= len(ids)

        self.ids = ids.keys()
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if not (ids and seqs):
            raise StopIteration

        if (self.records_per_alignment is not None
                and self.records_per_alignment != len(ids)):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        alignment_length = len(list(seqs.values())[0])
        records = []  # Alignment obj will put them all in a list anyway
        for seq_id in ids:
            seq = seqs[seq_id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            name, start, end = self._identifier_split(seq_id)
            # Accession will be overridden by _populate_meta_data if an explicit
            # accession is provided:
            record = SeqRecord(
                Seq(seq),
                id=seq_id,
                name=name,
                description=seq_id,
                annotations={"accession": name},
            )

            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end

            self._populate_meta_data(seq_id, record)
            records.append(record)
        for k, v in gc.items():
            if len(v) != alignment_length:
                raise ValueError("%s length %i, expected %i" %
                                 (k, len(v), alignment_length))
        alignment = MultipleSeqAlignment(records)

        for k, v in sorted(gc.items()):
            if k in self.pfam_gc_mapping:
                alignment.column_annotations[self.pfam_gc_mapping[k]] = v
            elif k.endswith("_cons") and k[:-5] in self.pfam_gr_mapping:
                alignment.column_annotations[self.pfam_gr_mapping[
                    k[:-5]]] = v
            else:
                # Unrecognised key: keep it under a "GC:" prefix.
                alignment.column_annotations["GC:" + k] = v

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        return alignment
Пример #10
0
    def next(self):
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an MultipleSeqAlignment object containing two rows.

        Raises StopIteration once the handle is exhausted or the ">>><<<"
        end-of-alignments marker is reached.  Raises ValueError when the
        input does not have the expected layout, including a file that
        stops part way through an alignment.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        if line.startswith("#"):
            # Skip the file header before the alignments.
            line = self._skip_file_header(line)
        while ">>>" in line and not line.startswith(">>>"):
            # Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            # Read in the query header
            line = self._parse_query_header(line)
            # Now should be some alignments, but if not we move onto the next query
        if not line:
            # End of file
            raise StopIteration
        if ">>><<<" in line:
            # Reached the end of the alignments, no need to read the footer...
            raise StopIteration

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        alignment_annotation = {}

        # This should be the target match ID line, followed by more tags, e.g.
        #
        # >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [...]
        # ; fa_frame: f
        # ; fa_initn:  52
        # ; fa_init1:  52
        # ; fa_opt:  70
        # ; fa_z-score: 105.5
        # ; fa_bits: 27.5
        # ; fa_expect:  0.082
        # ; sw_score: 70
        # ; sw_ident: 0.279
        # ; sw_sim: 0.651
        # ; sw_overlap: 43
        #
        # Should start >>... and not >>>...
        if line[0:2] != ">>" or line[0:3] == ">>>":
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        # Handle the following "alignment hit" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line[0:2] == "; "

        # Then we have the alignment numbers and sequence for the query, e.g.
        #
        # >gi|10955265| ..
        # ; sq_len: 346
        # ; sq_offset: 1
        # ; sq_type: p
        # ; al_start: 197
        # ; al_stop: 238
        # ; al_display_start: 167
        # DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        # QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        # GEYFTENKPKYIIREIHQET
        #
        # startswith() rather than line[0] so an empty line (end of file)
        # is reported as a ValueError instead of an IndexError.
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split(None, 1)[0])

        # Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line[0:2] == "; "

        # Now should have the aligned query sequence (with leading flanking region)
        # Stop at end of file too; the check below then rejects the empty line.
        while line and not line.startswith(">"):
            query_seq_parts.append(line.strip())
            line = handle.readline()

        # Handle the following "match alignment" data, e.g.
        #
        # >gi|152973545|ref|YP_001338596.1| ..
        # ; sq_len: 242
        # ; sq_type: p
        # ; al_start: 52
        # ; al_stop: 94
        # ; al_display_start: 22
        # IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        # RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        # QDFAFTRKMRREARQVEQSW
        #
        # Match identifier
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError(
                "Expected line starting '>' and ending '..', got '%s'" %
                repr(line))
        assert match_descr.startswith(line[1:].split(None, 1)[0])

        # Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line[0:2] == "; "

        def ends_sequence(text):
            """Return True for a tag line, a '>' header or a '>>>' marker."""
            return text[0:2] == "; " or text[0:1] == ">" or ">>>" in text

        # Now should have the aligned match sequence with flanking region...
        while line and not ends_sequence(line):
            match_seq_parts.append(line.strip())
            line = handle.readline()
        # ...and since FASTA 35.4.1 there can be a consensus after it, e.g.
        #
        # ; al_cons:
        # .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...:
        #
        if line[0:2] == "; ":
            assert line.strip() == "; al_cons:"
            # The consensus is read past but not kept.  If we do anything
            # with this in future, must remove any flanking region.
            line = handle.readline()
            while line and not ends_sequence(line):
                line = handle.readline()
            assert not line[0:2] == "; "
        if not line:
            raise ValueError("Premature end of file within an alignment")
        assert line[0:1] == ">" or ">>>" in line
        # Keep the line that ended this alignment for the next call.
        self._header = line

        # We built a list of strings and then joined them because
        # its faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        # Note, query_seq and match_seq will usually be of different lengths, apparently
        # because in the m10 format leading gaps are added but not trailing gaps!

        # Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(
            query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(
            match_seq, match_annotation)
        # How can we do this for the (optional) consensus?

        # The "sq_offset" values can be specified with the -X command line option.
        # They appear to just shift the origin used in the calculation of the coordinates.

        if len(query_align_seq) != len(match_align_seq):
            raise ValueError(
                "Problem parsing the alignment sequence coordinates, "
                "following should be the same length but are not:\n"
                "%s - len %i\n%s - len %i" %
                (query_align_seq, len(query_align_seq), match_align_seq,
                 len(match_align_seq)))
        if "sw_overlap" in alignment_annotation:
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
                raise ValueError(
                    "Specified sw_overlap = %s does not match expected value %i"
                    % (alignment_annotation["sw_overlap"],
                       len(query_align_seq)))

        # TODO - Look at the "sq_type" to assign a sensible alphabet?
        alphabet = self.alphabet
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # Alignment tags are applied last so they win on a key clash.
        alignment._annotations.update(self._query_header_annotation)
        alignment._annotations.update(alignment_annotation)

        # Query row first, then the match row; both are built the same way.
        rows = (
            ("query", self._query_descr, query_align_seq, query_annotation),
            ("match", match_descr, match_align_seq, match_annotation),
        )
        for role, descr, aligned, tags in rows:
            record = SeqRecord(
                Seq(aligned, alphabet),
                id=descr.split(None, 1)[0].strip(","),
                name=role,
                description=descr,
                annotations={"original_length": int(tags["sq_len"])})
            # TODO - handle start/end coordinates properly. Short term hack for now:
            record._al_start = int(tags["al_start"])
            record._al_stop = int(tags["al_stop"])
            alignment.append(record)

            # TODO - What if a specific alphabet has been requested?
            # TODO - Use an IUPAC alphabet?
            # TODO - Can FASTA output RNA?
            # This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in tags:
                if tags["sq_type"] == "D":
                    record.seq.alphabet = generic_dna
                elif tags["sq_type"] == "p":
                    record.seq.alphabet = generic_protein
            if "-" in aligned and not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Пример #11
0
    def next(self):
        """Reads from the handle to construct and return the next alignment.

        This returns the pairwise alignment of query and match/library
        sequences as an MultipleSeqAlignment object containing two rows.

        Returns None once the handle is exhausted or the ">>><<<"
        end-of-alignments marker is reached.  Raises ValueError when the
        input does not have the expected layout, including a file that
        stops part way through an alignment.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            return None

        if line.startswith("#"):
            # Skip the file header before the alignments.
            line = self._skip_file_header(line)
        while ">>>" in line and not line.startswith(">>>"):
            # Moved onto the next query sequence!
            self._query_descr = ""
            self._query_header_annotation = {}
            # Read in the query header
            line = self._parse_query_header(line)
            # Now should be some alignments, but if not we move onto the next query
        if not line:
            # End of file
            return None
        if ">>><<<" in line:
            # Reached the end of the alignments, no need to read the footer...
            return None

        query_seq_parts, match_seq_parts = [], []
        query_annotation, match_annotation = {}, {}
        alignment_annotation = {}

        # This should be the target match ID line, followed by more tags, e.g.
        #
        # >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [...]
        # ; fa_frame: f
        # ; fa_initn:  52
        # ; fa_init1:  52
        # ; fa_opt:  70
        # ; fa_z-score: 105.5
        # ; fa_bits: 27.5
        # ; fa_expect:  0.082
        # ; sw_score: 70
        # ; sw_ident: 0.279
        # ; sw_sim: 0.651
        # ; sw_overlap: 43
        #
        # Should start >>... and not >>>...
        if line[0:2] != ">>" or line[0:3] == ">>>":
            raise ValueError("Expected target line starting '>>'")
        match_descr = line[2:].strip()
        # Handle the following "alignment hit" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, alignment_annotation)
        assert not line[0:2] == "; "

        # Then we have the alignment numbers and sequence for the query, e.g.
        #
        # >gi|10955265| ..
        # ; sq_len: 346
        # ; sq_offset: 1
        # ; sq_type: p
        # ; al_start: 197
        # ; al_stop: 238
        # ; al_display_start: 167
        # DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
        # QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
        # GEYFTENKPKYIIREIHQET
        #
        # startswith() rather than line[0] so an empty line (end of file)
        # is reported as a ValueError instead of an IndexError.
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..'")
        assert self._query_descr.startswith(line[1:].split(None, 1)[0])

        # Handle the following "query alignment" tagged data
        line = handle.readline()
        line = self._parse_tag_section(line, query_annotation)
        assert not line[0:2] == "; "

        # Now should have the aligned query sequence (with leading flanking region)
        # Stop at end of file too; the check below then rejects the empty line.
        while line and not line.startswith(">"):
            query_seq_parts.append(line.strip())
            line = handle.readline()

        # Handle the following "match alignment" data, e.g.
        #
        # >gi|152973545|ref|YP_001338596.1| ..
        # ; sq_len: 242
        # ; sq_type: p
        # ; al_start: 52
        # ; al_stop: 94
        # ; al_display_start: 22
        # IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
        # RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
        # QDFAFTRKMRREARQVEQSW
        #
        # Match identifier
        if not (line.startswith(">") and line.strip().endswith("..")):
            raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
        assert match_descr.startswith(line[1:].split(None, 1)[0])

        # Tagged data,
        line = handle.readline()
        line = self._parse_tag_section(line, match_annotation)
        assert not line[0:2] == "; "

        def ends_sequence(text):
            """Return True for a tag line, a '>' header or a '>>>' marker."""
            return text[0:2] == "; " or text[0:1] == ">" or ">>>" in text

        # Now should have the aligned match sequence with flanking region...
        while line and not ends_sequence(line):
            match_seq_parts.append(line.strip())
            line = handle.readline()
        # ...and since FASTA 35.4.1 there can be a consensus after it, e.g.
        #
        # ; al_cons:
        # .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...:
        #
        if line[0:2] == "; ":
            assert line.strip() == "; al_cons:"
            # The consensus is read past but not kept.  If we do anything
            # with this in future, must remove any flanking region.
            line = handle.readline()
            while line and not ends_sequence(line):
                line = handle.readline()
            assert not line[0:2] == "; "
        if not line:
            raise ValueError("Premature end of file within an alignment")
        assert line[0:1] == ">" or ">>>" in line
        # Keep the line that ended this alignment for the next call.
        self._header = line

        # We built a list of strings and then joined them because
        # its faster than appending to a string.
        query_seq = "".join(query_seq_parts)
        match_seq = "".join(match_seq_parts)
        del query_seq_parts, match_seq_parts
        # Note, query_seq and match_seq will usually be of different lengths, apparently
        # because in the m10 format leading gaps are added but not trailing gaps!

        # Remove the flanking regions,
        query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
        match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
        # How can we do this for the (optional) consensus?

        # The "sq_offset" values can be specified with the -X command line option.
        # They appear to just shift the origin used in the calculation of the coordinates.

        if len(query_align_seq) != len(match_align_seq):
            raise ValueError("Problem parsing the alignment sequence coordinates, "
                             "following should be the same length but are not:\n"
                             "%s - len %i\n%s - len %i" % (query_align_seq,
                                                           len(query_align_seq),
                                                           match_align_seq,
                                                           len(match_align_seq)))
        if "sw_overlap" in alignment_annotation:
            if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
                raise ValueError(
                    "Specified sw_overlap = %s does not match expected value %i"
                    % (alignment_annotation["sw_overlap"],
                       len(query_align_seq)))

        # TODO - Look at the "sq_type" to assign a sensible alphabet?
        alphabet = self.alphabet
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # Alignment tags are applied last so they win on a key clash.
        alignment._annotations.update(self._query_header_annotation)
        alignment._annotations.update(alignment_annotation)

        # Query row first, then the match row; both are built the same way.
        rows = (
            ("query", self._query_descr, query_align_seq, query_annotation),
            ("match", match_descr, match_align_seq, match_annotation),
        )
        for role, descr, aligned, tags in rows:
            record = SeqRecord(Seq(aligned, alphabet),
                               id=descr.split(None, 1)[0].strip(","),
                               name=role,
                               description=descr,
                               annotations={"original_length": int(tags["sq_len"])})
            # TODO - handle start/end coordinates properly. Short term hack for now:
            record._al_start = int(tags["al_start"])
            record._al_stop = int(tags["al_stop"])
            alignment.append(record)

            # TODO - What if a specific alphabet has been requested?
            # TODO - Use an IUPAC alphabet?
            # TODO - Can FASTA output RNA?
            # This is still a very crude way of dealing with the alphabet:
            if alphabet == single_letter_alphabet and "sq_type" in tags:
                if tags["sq_type"] == "D":
                    record.seq.alphabet = generic_dna
                elif tags["sq_type"] == "p":
                    record.seq.alphabet = generic_protein
            if "-" in aligned and not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Пример #12
0
def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
    """Iterate over a MAF file-like object, yielding one alignment per block.

    A block (bundle) starts at a line beginning with "a" and is closed by
    the first blank line, or by the end of the input. Every "s" line inside
    the block becomes a SeqRecord whose id and name are the line's source
    field; SeqRecord IDs generally correspond to species names.

    Arguments:
     - handle    - iterator over text lines; it is advanced with next().
     - seq_count - optional expected number of "s" lines per block, checked
                   with an assert when the block is closed.
     - alphabet  - alphabet passed to every Seq and to the alignment.

    Yields MultipleSeqAlignment objects. The key=value pairs of the "a"
    line are stored as a dict in the private ``_annotations`` attribute of
    the alignment; each record's annotations hold "start", "size", "strand"
    (1 or -1) and "srcSize".

    Raises ValueError for an "s" line without exactly 7 fields, a dot in the
    first sequence of a block, a dotted sequence whose length differs from
    the first sequence, a malformed "a" line, or an unexpected line inside
    a block.
    """
    in_a_bundle = False

    # Placeholders only: both are replaced for every bundle.
    annotations = {}
    records = []

    while True:
        # allows parsing of the last bundle without duplicating code:
        # an exhausted handle looks like a blank line, which closes any
        # open bundle and, outside a bundle, ends the loop.
        try:
            line = next(handle)
        except StopIteration:
            line = ""

        if in_a_bundle:
            if line.startswith("s"):
                # add a SeqRecord to the bundle
                line_split = line.strip().split()

                if len(line_split) != 7:
                    raise ValueError("Error parsing alignment - 's' line must have 7 fields")

                # convert MAF-style +/- strand to biopython-type 1/-1
                if line_split[4] == "+":
                    strand = 1
                elif line_split[4] == "-":
                    strand = -1
                else:
                    # TODO: issue warning, set to 0?
                    strand = 1

                # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
                anno = {"start": int(line_split[2]),
                        "size": int(line_split[3]),
                        "strand": strand,
                        "srcSize": int(line_split[5])}

                sequence = line_split[6]

                # interpret a dot/period to mean the same as the first sequence
                if "." in sequence:
                    if not records:
                        raise ValueError("Found dot/period in first sequence of alignment")

                    ref = str(records[0].seq)

                    # zip() stops at the shorter input, so without this check
                    # a dotted row longer than the reference would be silently
                    # cut down to the reference length.
                    if len(sequence) != len(ref):
                        raise ValueError("Error parsing alignment - sequence with dot/period "
                                         "differs in length from the first sequence")

                    sequence = "".join(ref_letter if letter == "." else letter
                                       for (letter, ref_letter) in zip(sequence, ref))

                records.append(SeqRecord(Seq(sequence, alphabet),
                                         id=line_split[1],
                                         name=line_split[1],
                                         description="",
                                         annotations=anno))
            elif line.startswith("i"):
                # TODO: information about what is in the aligned species DNA before
                # and after the immediately preceding "s" line
                pass
            elif line.startswith("e"):
                # TODO: information about the size of the gap between the alignments
                # that span the current block
                pass
            elif line.startswith("q"):
                # TODO: quality of each aligned base for the species.
                # Need to find documentation on this, looks like ASCII 0-9 or gap?
                # Can then store in each SeqRecord's .letter_annotations dictionary,
                # perhaps as the raw string or turned into integers / None for gap?
                pass
            elif line.startswith("#"):
                # ignore comments
                # (not sure whether comments
                # are in the maf specification, though)
                pass
            elif not line.strip():
                # end a bundle of records
                # NOTE: kept as an assert so callers see the same exception
                # type as before; it is skipped when Python runs with -O.
                if seq_count is not None:
                    assert len(records) == seq_count

                alignment = MultipleSeqAlignment(records, alphabet)
                # TODO - Introduce an annotated alignment class?
                # See also Bio/AlignIO/FastaIO.py for same requirement.
                # For now, store the annotation a new private property:
                alignment._annotations = annotations

                yield alignment

                in_a_bundle = False

                annotations = {}
                records = []
            else:
                raise ValueError("Error parsing alignment - unexpected line:\n%s" % (line,))
        elif line.startswith("a"):
            # start a bundle of records
            in_a_bundle = True
            annot_strings = line.strip().split()[1:]
            # Every field after the leading "a" must hold exactly one "=".
            if len(annot_strings) != line.count("="):
                raise ValueError("Error parsing alignment - invalid key in 'a' line")
            annotations = dict([a_string.split("=") for a_string in annot_strings])
        elif line.startswith("#"):
            # ignore comments
            pass
        elif not line:
            # end of input outside a bundle: nothing left to parse
            break
Пример #13
0
def MafIterator(handle, seq_count = None, alphabet = single_letter_alphabet):
    """Iterate over a MAF file-like object, yielding one alignment per block.

    A block (bundle) starts at a line beginning with "a" and is closed by
    the first blank line, or by the end of the input. Every "s" line inside
    the block becomes a SeqRecord whose id and name are the line's source
    field; SeqRecord IDs generally correspond to species names.

    Arguments:
     - handle    - iterator over text lines; it is advanced with next().
     - seq_count - optional expected number of "s" lines per block, checked
                   with an assert when the block is closed.
     - alphabet  - alphabet passed to every Seq and to the alignment.

    Yields MultipleSeqAlignment objects. The key=value pairs of the "a"
    line are stored as a dict in the private ``_annotations`` attribute of
    the alignment; each record's annotations hold "start", "size", "strand"
    (the string "+1" or "-1") and "srcSize".

    Raises ValueError for an "s" line without exactly 7 fields, a dot in the
    first sequence of a block, a dotted sequence whose length differs from
    the first sequence, a malformed "a" line, or an unexpected line inside
    a block.
    """

    in_a_bundle = False

    # Placeholders only: both are replaced for every bundle.
    annotations = {}
    records = []

    while True:
        # allows parsing of the last bundle without duplicating code:
        # an exhausted handle looks like a blank line, which closes any
        # open bundle and, outside a bundle, ends the loop.
        try:
            # built-in next() instead of the Python 2-only handle.next()
            line = next(handle)
        except StopIteration:
            line = ""

        if in_a_bundle:
            if line.startswith("s"):
                # add a SeqRecord to the bundle
                line_split = line.strip().split()

                # "!=" instead of "<>", which is a syntax error on Python 3
                if len(line_split) != 7:
                    raise ValueError("Error parsing alignment - 's' line must have 7 fields")

                # convert MAF-style +/- strand to biopython-style +1/-1;
                # anything else is treated as the forward strand
                if line_split[4] == "-":
                    strand = "-1"
                else:
                    strand = "+1"

                # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
                anno = {"start": int(line_split[2]),
                        "size": int(line_split[3]),
                        "strand": strand,
                        "srcSize": int(line_split[5])}

                sequence = line_split[6]

                # interpret a dot/period to mean the same as the first sequence
                if "." in sequence:
                    if not records:
                        raise ValueError("Found dot/period in first sequence of alignment")

                    ref = str(records[0].seq)

                    # zip() stops at the shorter input, so without this check
                    # a dotted row longer than the reference would be silently
                    # cut down to the reference length.
                    if len(sequence) != len(ref):
                        raise ValueError("Error parsing alignment - sequence with dot/period "
                                         "differs in length from the first sequence")

                    sequence = "".join(r if s == "." else s
                                       for (s, r) in zip(sequence, ref))

                records.append(SeqRecord(Seq(sequence, alphabet),
                               id = line_split[1],
                               name = line_split[1],
                               description = "",
                               annotations = anno))
            elif line.startswith("e") or \
                 line.startswith("i") or \
                 line.startswith("q"):
                # not implemented
                pass
            elif line.startswith("#"):
                # ignore comments inside a bundle, as is already done for
                # comments outside a bundle
                pass
            elif not line.strip():
                # end a bundle of records
                # NOTE: kept as an assert so callers see the same exception
                # type as before; it is skipped when Python runs with -O.
                if seq_count is not None:
                    assert len(records) == seq_count

                alignment = MultipleSeqAlignment(records, alphabet)
                #TODO - Introduce an annotated alignment class?
                #See also Bio/AlignIO/FastaIO.py for same requirement.
                #For now, store the annotation a new private property:
                alignment._annotations = annotations

                yield alignment

                in_a_bundle = False

                annotations = {}
                records = []
            else:
                raise ValueError("Error parsing alignment - unexpected line:\n%s" % (line,))
        elif line.startswith("a"):
            # start a bundle of records
            in_a_bundle = True

            annot_strings = line.strip().split()[1:]
            # Every field after the leading "a" must hold exactly one "=".
            if len(annot_strings) != line.count("="):
                raise ValueError("Error parsing alignment - invalid key in 'a' line")

            annotations = dict([x.split("=") for x in annot_strings])
        elif line.startswith("#"):
            # ignore comments
            pass
        elif not line:
            # end of input outside a bundle: nothing left to parse
            break
Пример #14
0
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect")
        tool = global_tags.get("tool", "").upper()

        q = _extract_alignment_region(query_seq, query_tags)
        if tool in ["TFASTX"] and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        if len(q) != len(m):
            raise ValueError(f"""\
Darn... amino acids vs nucleotide coordinates?
tool: {tool}
query_seq: {query_seq}
query_tags: {query_tags}
{q} length: {len(q)}
match_seq: {match_seq}
match_tags: {match_tags}
{m} length: {len(m)}
handle.name: {handle.name}
""")

        alignment = MultipleSeqAlignment([])

        # TODO - Introduce an annotated alignment class?
        # See also Bio/AlignIO/MafIO.py for same requirement.
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(q),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])},
        )
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        # TODO - Can FASTA output RNA?
        if "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.annotations["molecule_type"] = "DNA"
            elif query_tags["sq_type"] == "p":
                record.annotations["molecule_type"] = "protein"

        # Match
        # =====
        record = SeqRecord(
            Seq(m),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])},
        )
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        if "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.annotations["molecule_type"] = "DNA"
            elif match_tags["sq_type"] == "p":
                record.annotations["molecule_type"] = "protein"

        return alignment