Пример #1
0
 def create_alignment(self, line):
     """Parse one line of tabular output and return an Alignment object.

     The line must consist of exactly 13 whitespace-separated columns, read
     in this order: query ID, target ID, percentage identity, alignment
     length, number of mismatches, number of gap openings, query start,
     query end, target start, target end, E-value, bit score, and the
     alignment itself encoded as a BTOP or CIGAR string (selected by
     ``self._alignment_representation``).

     The returned alignment has the target as its first row and the query
     as its second row, and carries the "program", "database",
     "mismatches", "evalue" and "bit_score" annotations.

     Raises ValueError if ``self._alignment_representation`` is neither
     "BTOP" nor "CIGAR".
     """
     columns = line.split()
     assert len(columns) == 13
     annotations = {}
     annotations["program"] = self._program
     annotations["database"] = self._database
     if self._query_id is not None:
         # The query ID from the header must agree with the one on the line.
         assert columns[0] == self._query_id
     query_id = columns[0]
     target_id = columns[1]
     percentage_identity = float(columns[2])
     alignment_length = int(columns[3])
     mismatches = int(columns[4])
     matches = alignment_length - mismatches
     # Consistency check between the reported percentage identity and the
     # one derived from the counts; the tolerance presumably allows for
     # rounding of the value printed in the file.
     difference = abs(100 * matches / alignment_length - percentage_identity)
     assert difference < 0.015
     # Parsed to verify that the column holds an integer; not stored.
     gap_opens = int(columns[5])
     # Start positions are converted from one-based to zero-based counting.
     query_start = int(columns[6]) - 1
     query_end = int(columns[7])
     target_start = int(columns[8]) - 1
     target_end = int(columns[9])
     annotations["mismatches"] = mismatches
     annotations["evalue"] = float(columns[10])
     annotations["bit_score"] = float(columns[11])
     representation = self._alignment_representation
     if representation == "BTOP":
         coordinates = self.parse_btop(columns[12])
     elif representation == "CIGAR":
         coordinates = self.parse_cigar(columns[12])
     else:
         # Without this branch, ``coordinates`` would be unbound below and
         # the failure would surface as an opaque UnboundLocalError.
         raise ValueError(
             "Unknown alignment representation %r (expected 'BTOP' or 'CIGAR')"
             % (representation,)
         )
     # Row 0 holds the target coordinates, row 1 the query coordinates.
     coordinates[0, :] += target_start
     query_size = self._query_size
     if query_start < query_end:
         coordinates[1, :] += query_start
     else:
         # mapped to reverse strand
         coordinates[1, :] = coordinates[1, ::-1]
         coordinates[1, :] += query_size - query_start - 1
     # Only lengths are passed to Seq here (no sequence data); presumably
     # this creates sequences of known length with undefined contents.
     query_sequence = Seq(None, length=query_size)
     query = SeqRecord(query_sequence, id=query_id)
     if self._query_description is not None:
         query.description = self._query_description
     target_sequence = Seq(None, length=target_end)
     target = SeqRecord(target_sequence, id=target_id)
     records = [target, query]
     alignment = Alignment(records, coordinates)
     alignment.annotations = annotations
     return alignment
Пример #2
0
 def create_alignment(
     records,
     aligned_sequences,
     strands,
     annotations,
     column_annotations,
     score,
 ):
     """Build an Alignment object from the alignment data gathered so far.

     The coordinates are inferred from the gapped sequences; each row is
     mirrored if its strand is "-", and then shifted by the start of the
     first defined range of the corresponding record's sequence. The
     annotations, column annotations and score are attached to the
     alignment only if they are not None.
     """
     coordinates = Alignment.infer_coordinates(aligned_sequences)
     for rec, orientation, coords in zip(records, strands, coordinates):
         if orientation == "-":
             # Mirror the row in place so it runs in the opposite direction.
             coords[:] = coords[-1] - coords[0] - coords
         offset = rec.seq.defined_ranges[0][0]
         coords += offset
     alignment = Alignment(records, coordinates)
     optional_attributes = (
         ("annotations", annotations),
         ("column_annotations", column_annotations),
         ("score", score),
     )
     for attribute, value in optional_attributes:
         if value is not None:
             setattr(alignment, attribute, value)
     return alignment
Пример #3
0
    def parse(self, stream):
        """Parse the alignments in the stream, yielding them one by one.

        This is a generator. For each alignment, it reads in turn:

        - a metadata header delimited by two lines starting with
          "#=======================================", consisting of
          "# key: value" (or "# key = value") lines; the sequence
          identifiers are read from the lines following "Aligned_sequences";
        - blocks of aligned sequence rows separated by empty lines, where
          the first 21 characters of a row hold the identifier and the start
          position, followed by the gapped sequence and the end position.
          Rows with an empty prefix are collected as the consensus line.

        An Alignment is yielded once the number of columns announced by the
        "Length" header field has been read. Nothing is yielded if stream is
        None.

        Raises ValueError if an unexpected line is found, or if the header
        lacks the sequence identifiers or the alignment length.
        """
        if stream is None:
            # A generator must finish by returning; raising StopIteration
            # inside a generator is converted into RuntimeError (PEP 479).
            return

        # identifiers is None while searching for the next metadata header;
        # sequences is None while the metadata header is being parsed.
        identifiers = None
        number_of_sequences = None
        annotations = {}
        for line in stream:
            line = line.rstrip("\r\n")
            if identifiers is None:
                # searching for alignment metadata start
                if not line:
                    continue
                elif line.startswith("#---------------------------------------"):
                    # may appear between alignments
                    continue
                elif line.startswith("#======================================="):
                    # found the alignment metadata start
                    identifiers = []
                    ncols = None
                    sequences = None
                else:
                    raise ValueError("Unexpected line: %s" % line)
            elif sequences is None:
                # parsing the alignment metadata
                if line == "#=======================================":
                    # reached the end of alignment metadata
                    if len(identifiers) == 0:
                        raise ValueError("Number of sequences missing!")
                    if ncols is None:
                        raise ValueError("Length of alignment missing!")
                    sequences = [""] * number_of_sequences
                    aligned_sequences = [""] * number_of_sequences
                    consensus = ""
                    starts = [0] * number_of_sequences
                    column = 0
                    index = 0
                    continue
                if line.strip() == "#":
                    continue
                if not line.startswith("# "):
                    # Format the message before constructing the exception;
                    # applying % to the exception object raises TypeError.
                    raise ValueError("Unexpected line: %s" % line)
                try:
                    key, value = line[2:].split(":", 1)
                except ValueError:
                    # An equal sign is used for Longest_Identity,
                    # Longest_Similarity, Shortest_Identity, and
                    # Shortest_Similarity, which are included if command line
                    # argument -nobrief was used.
                    key, value = line[2:].split(" = ", 1)
                if key == "Aligned_sequences":
                    number_of_sequences = int(value.strip())
                    assert len(identifiers) == 0
                    # Should now expect the record identifiers...
                    # These lines are consumed directly from the stream, so
                    # they still carry their line endings (stripped below).
                    for i, line in enumerate(stream):
                        if not line.startswith("# "):
                            raise ValueError("Unexpected line: %s" % line)
                        # Split on the first colon only, so that identifiers
                        # containing a colon are kept intact.
                        number, identifier = line[2:].split(":", 1)
                        assert i + 1 == int(number)
                        identifiers.append(identifier.strip())
                        if len(identifiers) == number_of_sequences:
                            break
                elif key == "Matrix":
                    annotations["matrix"] = value.strip()
                elif key == "Gap_penalty":
                    annotations["gap_penalty"] = float(value.strip())
                elif key == "Extend_penalty":
                    annotations["extend_penalty"] = float(value.strip())
                elif key == "Length":
                    ncols = int(value.strip())
                elif key == "Identity":
                    # Only the count before the "/" is stored.
                    annotations["identity"] = int(value.strip().split("/")[0])
                elif key == "Similarity":
                    annotations["similarity"] = int(value.strip().split("/")[0])
                elif key == "Gaps":
                    annotations["gaps"] = int(value.strip().split("/")[0])
                elif key == "Score":
                    annotations["score"] = float(value.strip())
                # TODO:
                # The following are generated if the -nobrief command line
                # argument used. We could simply calculate them from the
                # alignment, but then we have to define what we mean by
                # "similar". For now, simply store them as an annotation.
                elif key == "Longest_Identity":
                    annotations["longest_identity"] = value.strip()
                elif key == "Longest_Similarity":
                    annotations["longest_similarity"] = value.strip()
                elif key == "Shortest_Identity":
                    annotations["shortest_identity"] = value.strip()
                elif key == "Shortest_Similarity":
                    annotations["shortest_similarity"] = value.strip()
                else:
                    raise ValueError("Failed to parse line '%s'" % line)
            else:
                # parse the sequences
                if not line:
                    # empty line
                    if index == number_of_sequences:
                        # reached the end of an alignment block
                        index = 0
                        if column == ncols:
                            # reached the end of the sequences
                            coordinates = Alignment.infer_coordinates(aligned_sequences)
                            records = []
                            n = len(sequences)
                            for i in range(n):
                                start = starts[i]
                                if start == 0:
                                    sequence = Seq(sequences[i])
                                else:
                                    coordinates[i, :] += start
                                    # create a partially defined sequence
                                    length = start + len(sequences[i])
                                    data = {start: sequences[i]}
                                    sequence = Seq(data, length=length)
                                record = SeqRecord(sequence, identifiers[i])
                                records.append(record)
                            alignment = Alignment(records, coordinates)
                            if annotations:
                                alignment.annotations = annotations
                            if consensus:
                                alignment.column_annotations = {
                                    "emboss_consensus": consensus
                                }
                            yield alignment
                            # Reset the state to search for the next header.
                            identifiers = None
                            annotations = {}
                    continue
                prefix = line[:21].strip()
                if prefix == "":
                    # match line
                    consensus += line[21:71]
                else:
                    identifier, start = prefix.split(None, 1)
                    assert identifiers[index].startswith(identifier)
                    aligned_sequence, end = line[21:].split(None, 1)
                    start = int(start) - 1  # Python counting
                    end = int(end)
                    length = len(sequences[index])
                    sequence = aligned_sequence.replace("-", "")
                    if length == 0 and len(sequence) > 0:
                        # Record the start
                        starts[index] = start
                    else:
                        # NOTE(review): for the "srspair" format, a row made
                        # up of gaps only appears to report a start that is
                        # one less than expected - confirm against real
                        # output files.
                        if self.align_format == "srspair" and len(sequence) == 0:
                            start += 1
                        assert start == starts[index] + length
                    assert end == start + len(sequence)
                    sequences[index] += sequence
                    aligned_sequences[index] += aligned_sequence
                    if index == 0:
                        # The first row determines the column count so far;
                        # the other rows must agree with it.
                        column += len(aligned_sequence)
                    else:
                        assert column == len(aligned_sequences[index])
                    index += 1
Пример #4
0
    def parse(self, stream):
        """Parse the alignments in the stream, yielding them one by one.

        This is a generator. A line equal to "# STOCKHOLM 1.0" starts a new
        alignment and a line equal to "//" ends it, at which point the
        Alignment object is created, annotated, and yielded. In between,
        lines not starting with "#" are read as "<seqname> <sequence>", and
        lines starting with "#=GF ", "#=GC ", "#=GS " or "#=GR " are
        collected as per-file, per-column, per-sequence, and
        per-sequence-per-column annotations, respectively. Empty lines are
        skipped. Nothing is yielded if stream is None.

        Raises ValueError if a sequence line cannot be split into a name and
        a sequence, or if the aligned sequences differ in length.
        """
        if stream is None:
            # A generator must finish by returning; raising StopIteration
            # inside a generator is converted into RuntimeError (PEP 479).
            return

        for line in stream:
            line = line.strip()
            if not line:
                continue
            elif line == "# STOCKHOLM 1.0":
                # Starting a new alignment
                records = []
                aligned_sequences = []
                references = []
                reference_comments = []
                database_references = []
                nested_domains = []
                gf = defaultdict(list)
                gc = {}
                gs = defaultdict(lambda: {"DR": []})
                gr = defaultdict(dict)
                length = None
            elif line == "//":
                # Reached the end of the alignment.
                # infer_coordinates is handed this list and it is forwarded
                # to the per-column annotation step; presumably it is filled
                # with the indices of columns left out of the alignment.
                skipped_columns = []
                coordinates = Alignment.infer_coordinates(
                    aligned_sequences, skipped_columns
                )
                skipped_columns = set(skipped_columns)
                alignment = Alignment(records, coordinates)
                alignment.annotations = {}
                if references:
                    alignment.annotations["references"] = []
                    for reference in references:
                        # Convert to a plain dict, and join the multi-line
                        # fields into single strings.
                        reference = dict(reference)
                        reference["title"] = " ".join(reference["title"])
                        reference["author"] = " ".join(reference["author"])
                        reference["location"] = " ".join(reference["location"])
                        alignment.annotations["references"].append(reference)
                if database_references:
                    alignment.annotations["database references"] = database_references
                if nested_domains:
                    alignment.annotations["nested domains"] = nested_domains
                rows, columns = alignment.shape
                AlignmentIterator._store_per_file_annotations(alignment, gf, rows)
                AlignmentIterator._store_per_column_annotations(
                    alignment, gc, columns, skipped_columns
                )
                AlignmentIterator._store_per_sequence_annotations(alignment, gs)
                AlignmentIterator._store_per_sequence_and_per_column_annotations(
                    alignment, gr
                )
                yield alignment
            elif not line.startswith("#"):
                # Sequence
                # Format: "<seqname> <sequence>"
                try:
                    seqname, aligned_sequence = line.split(None, 1)
                except ValueError:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError(
                        "Could not split line into sequence name and aligned sequence:\n"
                        + line
                    ) from None
                if length is None:
                    length = len(aligned_sequence)
                elif length != len(aligned_sequence):
                    raise ValueError(
                        f"Aligned sequence {seqname} consists of {len(aligned_sequence)} letters, expected {length} letters)"
                    )
                # Both "." and "-" are treated as gap characters.
                aligned_sequence = aligned_sequence.replace(".", "-")
                sequence = aligned_sequence.replace("-", "")
                aligned_sequences.append(aligned_sequence)
                seq = Seq(sequence)
                record = SeqRecord(seq, id=seqname)
                records.append(record)
            elif line.startswith("#=GF "):
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                if feature == "RN":
                    # Starts a new reference; the RM, RT, RA and RL lines
                    # that follow are added to this reference.
                    assert text.startswith("[")
                    assert text.endswith("]")
                    number = int(text[1:-1])
                    reference = defaultdict(list)
                    reference["number"] = number
                    if reference_comments:
                        # RC lines seen before this RN line belong to it.
                        reference["comment"] = " ".join(reference_comments)
                        reference_comments = []
                    references.append(reference)
                elif feature == "RM":
                    assert not reference["medline"]
                    reference["medline"] = text
                elif feature == "RT":
                    reference["title"].append(text)
                elif feature == "RA":
                    reference["author"].append(text)
                elif feature == "RL":
                    reference["location"].append(text)
                elif feature == "RC":
                    reference_comments.append(text)
                elif feature == "DR":
                    database_reference = {"reference": text}
                    database_references.append(database_reference)
                elif feature == "DC":
                    # Comment on the most recent database reference.
                    assert "comment" not in database_reference
                    database_reference["comment"] = text
                elif feature == "NE":
                    nested_domain = {"accession": text}
                    nested_domains.append(nested_domain)
                elif feature == "NL":
                    # Location of the most recent nested domain.
                    assert "location" not in nested_domain
                    nested_domain["location"] = text
                else:
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    gf[feature].append(text)
            elif line.startswith("#=GC "):
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                feature, text = line[5:].strip().split(None, 2)
                if feature not in gc:
                    gc[feature] = ""
                gc[feature] += text.strip()  # append to any previous entry
                # Might be interleaved blocks, so can't check length yet
            elif line.startswith("#=GS "):
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                try:
                    seqname, feature, text = line[5:].strip().split(None, 2)
                except ValueError:
                    # Free text can sometimes be empty, which a one line split throws an error for.
                    # See https://github.com/biopython/biopython/issues/2982 for more details
                    seqname, feature = line[5:].strip().split(None, 1)
                    text = ""
                if feature == "DR":
                    # Multiple DR entries are allowed per sequence.
                    gs[seqname][feature].append(text)
                else:
                    assert feature not in gs[seqname]
                    gs[seqname][feature] = text
            elif line.startswith("#=GR "):
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                terms = line[5:].split(None, 2)
                # The markup must refer to the most recently read seqname.
                assert terms[0] == seqname
                feature = terms[1]
                gr[seqname][feature] = terms[2].strip()