Exemplo n.º 1
0
def _fastq_to_tabular_msa(fh, variant=None, phred_offset=None,
                          constructor=None, **kwargs):
    if constructor is None:
        raise ValueError("Must provide `constructor`.")

    return TabularMSA(
        _fastq_to_generator(fh, variant=variant, phred_offset=phred_offset,
                            constructor=constructor, **kwargs))
Exemplo n.º 2
0
def _clustal_to_tabular_msa(fh, constructor=None):
    r"""yields labels and sequences from msa (multiple sequence alignment)

    Parameters
    ----------
    fh : open file object
        An open Clustal file.

    Returns
    -------
    skbio.TabularMSA
        MSA containing aligned sequences.

    Raises
    ------
    skbio.util.exception.ClustalFormatError
        If the sequences in `fh` don't have the same sequence length
        or if the sequence ids don't properly match with the subsequences

    Notes
    -----
    Skips any line that starts with a blank.

    ``_clustal_to_tabular_msa`` preserves the order of the sequences from the
    original file.  However, it does use a dict as an intermediate, so
    two sequences can't have the same label. This is probably OK since
    Clustal will refuse to run on a FASTA file in which two sequences have
    the same label, but could potentially cause trouble with manually
    edited files (all the segments of the conflicting sequences would
    be interleaved, possibly in an unpredictable way).

    If the lines have trailing numbers (i.e. Clustal was run with
    `-LINENOS=ON`), silently deletes them. Does not check that the numbers
    actually correspond to the number of chars in the sequence printed so far.

    References
    ----------
    .. [1] Thompson JD, Higgins DG, Gibson TJ,  "CLUSTAL W: improving the
        sensitivity of progressive multiple sequence alignment through sequence
        weighting, position-specific gap penalties and weight matrix choice.
        Thompson", Nucleic Acids Res. 1994 Nov 11;22(22):4673-80.

    """
    if constructor is None:
        raise ValueError("Must provide `constructor`.")

    records = map(_delete_trailing_number, filter(_is_clustal_seq_line, fh))
    data, labels = _label_line_parser(records)

    aligned_correctly = _check_length(data, labels)
    if not aligned_correctly:
        raise ClustalFormatError("Sequences not aligned properly")
    seqs = []
    for label in labels:
        seqs.append(constructor(''.join(data[label])))
    return TabularMSA(seqs, index=labels)
Exemplo n.º 3
0
def _phylip_to_tabular_msa(fh, constructor=None):
    if constructor is None:
        raise ValueError("Must provide `constructor`.")

    seqs = []
    index = []
    for seq, ID in _parse_phylip_raw(fh):
        seqs.append(constructor(seq))
        index.append(ID)
    return TabularMSA(seqs, index=index)
Exemplo n.º 4
0
    def build_tabular_msa(self, constructor):
        """Assemble the collected sequences and metadata into a TabularMSA.

        Parameters
        ----------
        constructor
            Passed to each stored sequence's ``build_sequence`` method.

        Returns
        -------
        TabularMSA
            MSA whose rows follow ``self._seq_order`` (also used as the
            index). Empty metadata / positional metadata are passed as
            ``None``.

        Raises
        ------
        StockholmFormatError
            If ``self._seqs`` and ``self._seq_order`` differ in length,
            which is reported as GS or GR metadata referring to sequence(s)
            that do not exist.

        """
        if len(self._seqs) != len(self._seq_order):
            unknown_names = set(self._seqs) - set(self._seq_order)
            raise StockholmFormatError('Found GS or GR metadata for '
                                       'nonexistent sequence(s): %r'
                                       % unknown_names)

        built_seqs = [self._seqs[name].build_sequence(constructor)
                      for name in self._seq_order]

        # Normalise falsy (empty) containers to None before handing them on.
        positional_metadata = self._positional_metadata or None
        metadata = self._metadata or None

        return TabularMSA(built_seqs,
                          metadata=metadata,
                          positional_metadata=positional_metadata,
                          index=self._seq_order)
Exemplo n.º 5
0
def _coerce_alignment_input_type(seq):
    """Wrap a single ``GrammaredSequence`` in a one-row ``TabularMSA``.

    Any input that is not a ``GrammaredSequence`` instance is returned
    unchanged.
    """
    return TabularMSA([seq]) if isinstance(seq, GrammaredSequence) else seq
Exemplo n.º 6
0
def local_pairwise_align_ssw(sequence1, sequence2, **kwargs):
    """Locally align two sequences using Striped Smith-Waterman.

    Parameters
    ----------
    sequence1 : DNA, RNA, or Protein
        The first unaligned sequence (used as the query).
    sequence2 : DNA, RNA, or Protein
        The second unaligned sequence (used as the target). Must be the
        same type as `sequence1`.
    **kwargs
        Optional keyword arguments forwarded to
        ``skbio.alignment.StripedSmithWaterman``.

    Returns
    -------
    tuple or None
        ``TabularMSA`` object containing the aligned sequences, alignment score
        (float), and start/end positions of each input sequence (iterable
        of two-item tuples). Note that start/end positions are indexes into the
        unaligned sequences. ``None`` is returned instead of a tuple when the
        alignment does not meet a provided filter.

    Raises
    ------
    TypeError
        If either input is not DNA, RNA, or Protein, or if the two inputs
        are not of the same type.

    Notes
    -----
    This is a wrapper for the SSW package [1]_.

    For a complete list of optional keyword-arguments that can be provided,
    see ``skbio.alignment.StripedSmithWaterman``.

    The following kwargs will not have any effect: `suppress_sequences`,
    `zero_index`, and `protein`. They are always overwritten here.

    References
    ----------
    .. [1] Zhao, Mengyao, Wan-Ping Lee, Erik P. Garrison, & Gabor T.
       Marth. "SSW Library: An SIMD Smith-Waterman C/C++ Library for
       Applications". PLOS ONE (2013). Web. 11 July 2014.
       http://www.plosone.org/article/info:doi/10.1371/journal.pone.0082138

    See Also
    --------
    skbio.alignment.StripedSmithWaterman

    """
    for candidate in (sequence1, sequence2):
        if isinstance(candidate, (DNA, RNA, Protein)):
            continue
        raise TypeError(
            "`sequence1` and `sequence2` must be DNA, RNA, or Protein, "
            "not type %r" % type(candidate).__name__)

    seq_type = type(sequence1)
    if seq_type is not type(sequence2):
        raise TypeError(
            "`sequence1` and `sequence2` must be the same type: %r != %r"
            % (seq_type.__name__, type(sequence2).__name__))

    # The aligned sequences are required to build the `TabularMSA`, so the
    # caller is not allowed to suppress them; indexing and alphabet are
    # likewise dictated by this wrapper.
    kwargs.update(suppress_sequences=False,
                  zero_index=True,
                  protein=isinstance(sequence1, Protein))

    aligner = StripedSmithWaterman(str(sequence1), **kwargs)
    alignment = aligner(str(sequence2))

    # A missing cigar means the alignment failed a filter.
    if not alignment.cigar:
        return None

    if alignment.query_begin == -1:
        start_end = None
    else:
        start_end = [
            (alignment.query_begin, alignment.query_end),
            (alignment.target_begin, alignment.target_end_optimal)
        ]

    # Carry metadata over only when the input actually has some.
    metadata1, metadata2 = (
        seq.metadata if seq.has_metadata() else None
        for seq in (sequence1, sequence2))

    aligned_query = seq_type(alignment.aligned_query_sequence,
                             metadata=metadata1, validate=False)
    aligned_target = seq_type(alignment.aligned_target_sequence,
                              metadata=metadata2, validate=False)
    msa = TabularMSA([aligned_query, aligned_target])

    return msa, alignment.optimal_alignment_score, start_end
Exemplo n.º 7
0
def global_pairwise_align(seq1, seq2, gap_open_penalty, gap_extend_penalty,
                          substitution_matrix, penalize_terminal_gaps=False):
    """Globally align a pair of seqs or alignments with Needleman-Wunsch

    Parameters
    ----------
    seq1 : GrammaredSequence or TabularMSA
        The first unaligned sequence(s).
    seq2 : GrammaredSequence or TabularMSA
        The second unaligned sequence(s).
    gap_open_penalty : int or float
        Penalty for opening a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    gap_extend_penalty : int or float
        Penalty for extending a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    substitution_matrix: 2D dict (or similar)
        Lookup for substitution scores (these values are added to the
        previous best alignment score).
    penalize_terminal_gaps: bool, optional
        If True, will continue to penalize gaps even after one sequence has
        been aligned through its end. This behavior is true Needleman-Wunsch
        alignment, but results in (biologically irrelevant) artifacts when
        the sequences being aligned are of different length. This is ``False``
        by default, which is very likely to be the behavior you want in all or
        nearly all cases.

    Returns
    -------
    tuple
        ``TabularMSA`` object containing the aligned sequences, alignment score
        (float), and start/end positions of each input sequence (iterable
        of two-item tuples). Note that start/end positions are indexes into the
        unaligned sequences.

    Raises
    ------
    TypeError
        If either input is neither a ``GrammaredSequence`` nor a
        ``TabularMSA``, or if the two inputs have different dtypes.

    See Also
    --------
    local_pairwise_align
    local_pairwise_align_protein
    local_pairwise_align_nucleotide
    skbio.alignment.local_pairwise_align_ssw
    global_pairwise_align_protein
    global_pairwise_align_nucleotide

    Notes
    -----
    This algorithm (in a slightly more basic form) was originally described
    in [1]_. The scikit-bio implementation was validated against the
    EMBOSS needle web server [2]_.

    This function can be used to align either a pair of sequences, a pair of
    alignments, or a sequence and an alignment.

    An ``EfficiencyWarning`` is always issued because this is a pure-Python
    implementation.

    References
    ----------
    .. [1] A general method applicable to the search for similarities in
       the amino acid sequence of two proteins.
       Needleman SB, Wunsch CD.
       J Mol Biol. 1970 Mar;48(3):443-53.
    .. [2] http://www.ebi.ac.uk/Tools/psa/emboss_needle/

    """
    slow_message = (
        "You're using skbio's python implementation of Needleman-Wunsch "
        "alignment. This is known to be very slow (e.g., thousands of times "
        "slower than a native C implementation). We'll be adding a faster "
        "version soon (see https://github.com/biocore/scikit-bio/issues/254 "
        "to track progress on this).")
    warn(slow_message, EfficiencyWarning)

    # A `TabularMSA` already guarantees that its dtype is a
    # `GrammaredSequence` subclass, so only the outer type needs checking.
    for candidate in (seq1, seq2):
        if isinstance(candidate, (GrammaredSequence, TabularMSA)):
            continue
        raise TypeError(
            "`seq1` and `seq2` must be GrammaredSequence subclasses or "
            "TabularMSA, not type %r" % type(candidate).__name__)

    # From here on both inputs are treated as alignments.
    seq1 = _coerce_alignment_input_type(seq1)
    seq2 = _coerce_alignment_input_type(seq2)

    if seq1.dtype is not seq2.dtype:
        raise TypeError(
            "`seq1` and `seq2` must have the same dtype: %r != %r"
            % (seq1.dtype.__name__, seq2.dtype.__name__))

    init_matrices_f = (_init_matrices_nw if penalize_terminal_gaps
                       else _init_matrices_nw_no_terminal_gap_penalty)

    score_matrix, traceback_matrix = _compute_score_and_traceback_matrices(
        seq1, seq2, gap_open_penalty, gap_extend_penalty,
        substitution_matrix, new_alignment_score=-np.inf,
        init_matrices_f=init_matrices_f,
        penalize_terminal_gaps=penalize_terminal_gaps)

    # A global alignment is traced back from the last cell of the matrix.
    n_rows, n_cols = traceback_matrix.shape[0], traceback_matrix.shape[1]
    end_row_position = n_rows - 1
    end_col_position = n_cols - 1

    traceback_result = _traceback(traceback_matrix, score_matrix, seq1, seq2,
                                  end_row_position, end_col_position)
    (aligned1, aligned2, score,
     seq1_start_position, seq2_start_position) = traceback_result

    start_end_positions = [(seq1_start_position, end_col_position-1),
                           (seq2_start_position, end_row_position-1)]

    return TabularMSA(aligned1 + aligned2), score, start_end_positions
Exemplo n.º 8
0
def local_pairwise_align(seq1, seq2, gap_open_penalty,
                         gap_extend_penalty, substitution_matrix):
    """Locally align exactly two seqs with Smith-Waterman

    Parameters
    ----------
    seq1 : GrammaredSequence
        The first unaligned sequence.
    seq2 : GrammaredSequence
        The second unaligned sequence.
    gap_open_penalty : int or float
        Penalty for opening a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    gap_extend_penalty : int or float
        Penalty for extending a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    substitution_matrix: 2D dict (or similar)
        Lookup for substitution scores (these values are added to the
        previous best alignment score).

    Returns
    -------
    tuple
        ``TabularMSA`` object containing the aligned sequences, alignment score
        (float), and start/end positions of each input sequence (iterable
        of two-item tuples). Note that start/end positions are indexes into the
        unaligned sequences.

    Raises
    ------
    TypeError
        If either input is not a ``GrammaredSequence``, or if the two
        inputs are not of the same type.

    See Also
    --------
    local_pairwise_align_protein
    local_pairwise_align_nucleotide
    skbio.alignment.local_pairwise_align_ssw
    global_pairwise_align
    global_pairwise_align_protein
    global_pairwise_align_nucleotide

    Notes
    -----
    This algorithm was originally described in [1]_. The scikit-bio
    implementation was validated against the EMBOSS water web server [2]_.

    An ``EfficiencyWarning`` is always issued because this is a pure-Python
    implementation.

    References
    ----------
    .. [1] Identification of common molecular subsequences.
       Smith TF, Waterman MS.
       J Mol Biol. 1981 Mar 25;147(1):195-7.
    .. [2] http://www.ebi.ac.uk/Tools/psa/emboss_water/

    """
    slow_message = (
        "You're using skbio's python implementation of Smith-Waterman "
        "alignment. This will be very slow (e.g., thousands of times slower) "
        "than skbio.alignment.local_pairwise_align_ssw.")
    warn(slow_message, EfficiencyWarning)

    for candidate in (seq1, seq2):
        if isinstance(candidate, GrammaredSequence):
            continue
        raise TypeError(
            "`seq1` and `seq2` must be %r subclasses, not type %r" %
            (GrammaredSequence.__name__, type(candidate).__name__))

    type1, type2 = type(seq1), type(seq2)
    if type1 is not type2:
        raise TypeError(
            "`seq1` and `seq2` must be the same type: %r != %r"
            % (type1.__name__, type2.__name__))

    # The matrix helpers operate on alignments, so wrap each sequence.
    seq1 = _coerce_alignment_input_type(seq1)
    seq2 = _coerce_alignment_input_type(seq2)

    score_matrix, traceback_matrix = _compute_score_and_traceback_matrices(
        seq1, seq2, gap_open_penalty, gap_extend_penalty,
        substitution_matrix, new_alignment_score=0.0,
        init_matrices_f=_init_matrices_sw)

    # A local alignment is traced back from the best-scoring cell.
    best_flat_index = np.argmax(score_matrix)
    end_row_position, end_col_position = np.unravel_index(
        best_flat_index, score_matrix.shape)

    traceback_result = _traceback(traceback_matrix, score_matrix, seq1, seq2,
                                  end_row_position, end_col_position)
    (aligned1, aligned2, score,
     seq1_start_position, seq2_start_position) = traceback_result

    start_end_positions = [(seq1_start_position, end_col_position-1),
                           (seq2_start_position, end_row_position-1)]

    return TabularMSA(aligned1 + aligned2), score, start_end_positions
Exemplo n.º 9
0
def _fasta_to_tabular_msa(fh, qual=FileSentinel, constructor=None, **kwargs):
    """Build a ``TabularMSA`` from the records of ``_fasta_to_generator``.

    Parameters
    ----------
    fh
        Input handle, forwarded unchanged to ``_fasta_to_generator``.
    qual
        Forwarded unchanged (as a keyword argument) to
        ``_fasta_to_generator``. Defaults to ``FileSentinel``.
    constructor
        Required. Forwarded to ``_fasta_to_generator``; there is no usable
        default, so ``None`` is rejected.
    **kwargs
        Any additional keyword arguments, forwarded to
        ``_fasta_to_generator``.

    Returns
    -------
    TabularMSA
        MSA constructed from whatever ``_fasta_to_generator`` produces.

    Raises
    ------
    ValueError
        If `constructor` is ``None``.

    """
    if constructor is None:
        raise ValueError("Must provide `constructor`.")

    records = _fasta_to_generator(
        fh, qual=qual, constructor=constructor, **kwargs)
    return TabularMSA(records)