예제 #1
0
    def test_no_protein_support(self):
        """Check that EMBL protein records are rejected with a clear error."""
        # TODO: add protein support

        # message both read attempts below are expected to produce
        expected_msg = r"There's no protein support for EMBL record"

        # a made-up ID line declaring amino acids ("63 AA"), plus a "//" line
        fake_record = io.StringIO(
            'ID   M14399; SV 1; linear; mRNA; STD; PRO; 63 AA.\n//\n')

        # reading straight into a Protein object has to fail
        with self.assertRaisesRegex(EMBLFormatError, expected_msg):
            Protein.read(fake_record)

        # rewind the buffer so the second attempt sees the same content
        fake_record.seek(0)

        # the generic, format-driven reader has to fail the same way
        with self.assertRaisesRegex(EMBLFormatError, expected_msg):
            skbio.io.read(fake_record, format='embl')
예제 #2
0
    def test_no_protein_support(self):
        """Reading a protein-like EMBL record must raise EMBLFormatError."""
        # TODO: add protein support

        error_regex = r"There's no protein support for EMBL record"

        # in-memory stand-in for a file: a fake protein ID line and a "//" line
        embl_text = 'ID   M14399; SV 1; linear; mRNA; STD; PRO; 63 AA.\n//\n'
        stream = io.StringIO(embl_text)

        # attempt 1: read a protein record
        with self.assertRaisesRegex(EMBLFormatError, error_regex):
            Protein.read(stream)

        # go back to the start of the stream before the next attempt
        stream.seek(0)

        # attempt 2: read a generic record via the format name
        with self.assertRaisesRegex(EMBLFormatError, error_regex):
            skbio.io.read(stream, format='embl')
예제 #3
0
def mask_sequence(hhsuite_fp,
                  fullsequence_fp,
                  subsequences_fp=None,
                  min_prob=None,
                  max_pvalue=None,
                  max_evalue=None,
                  min_fragment_length=0,
                  min_identity=0):
    """ Splits a protein sequence according to HHsuite results.

    The query sequence is divided into sub-sequences covered by a selected hit
    ('match') and the regions not covered by any selected hit ('non_match').

    Parameters
    ----------
    hhsuite_fp : str
        Filepath to HHblits/HHsearch output.
    fullsequence_fp : str
        Filepath to the protein sequence of the original query.
    subsequences_fp : str
        Filepath to which sub-sequences are written as a multiple fasta file.
        Each sequence makes up one header line and one sequence line, i.e.
        sequences are not wrapped.
        Two files will be produced, suffixed by '.match' and '.non_match'. The
        first holds sub-sequences of hits, the second holds the sub-sequences
        not covered by a hit.
        Default: None, i.e. no file is written.
    min_prob: float
        Minimal probability of a hit to be included in the resulting list.
        Note: probabilities are in the range of 100.0 to 0.0.
        Default: None, i.e. no filtering on probability.
    max_pvalue: float
        Maximal P-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on P-value.
    max_evalue: float
        Maximal E-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on E-value.
    min_fragment_length: int
        Minimal fragment length of a hit to be included in the resulting list.
        The same value is also handed to report_uncovered_subsequences when
        the regions between hits are collected.
        Default: 0, i.e. no filtering on fragment length.
    min_identity: float
        Minimum pair-wise sequence identity of a hit to be included in the
        resulting list.
        Default: 0, i.e. no filtering on sequence identity.

    Returns
    -------
    dict of str -> [(str, str)]
        Two keys, 'match' and 'non_match'. Each maps to a list of tuples whose
        first component is a fasta header, while the second is its fasta
        sequence. Both lists are sorted by start position.

    Raises
    ------
    IOError
        If one of the output files cannot be opened or written.

    Notes
    -----
    A hit must satisfy ALL filtering options (min_prob, max_pvalue, max_evalue,
    min_fragment_length, min_identity) to be included in the resulting list.
    """

    # parse hits from file
    hits = parse_pdb_match(hhsuite_fp)

    # filter hits; passing None for a threshold switches that filter off
    if min_prob is not None:
        hits = [hit for hit in hits if hit['Probab'] >= min_prob]
    if max_pvalue is not None:
        hits = [hit for hit in hits if hit['P-value'] <= max_pvalue]
    if max_evalue is not None:
        hits = [hit for hit in hits if hit['E-value'] <= max_evalue]
    if min_fragment_length is not None:
        hits = [hit for hit in hits if frag_size(hit) >= min_fragment_length]
    if min_identity is not None:
        hits = [hit for hit in hits if hit['Identities'] >= min_identity]

    # read the original protein file, used to run HHsearch
    p = Protein.read(fullsequence_fp, seq_num=1)
    query_id = p.metadata['id']
    query_desc = p.metadata['description']

    # entries are (header, sequence, start); start is only kept for sorting
    results = {'match': [], 'non_match': []}

    # select non overlapping positive hits
    # NOTE(review): the very large e_value_threshold presumably disables
    # select_hits' own E-value cut-off, as filtering happened above - confirm.
    subseqs_pos = select_hits(hits, e_value_threshold=999999)

    for hit in subseqs_pos:
        query_aln = hit['alignment'][get_q_id(hit)]
        match_id = hit['Hit'].split()[0]
        header = "%s %s %s" % (
            correct_header_positions(query_id, query_aln['start'],
                                     query_aln['end']),
            '# %s' % match_id, query_desc)
        # alignment gaps are not part of the query sequence itself
        seq = query_aln['sequence'].replace('-', '')
        results['match'].append((header, seq, query_aln['start']))

    # collect gaps between positive hits
    subseqs_neg = report_uncovered_subsequences(subseqs_pos, str(p),
                                                min_fragment_length)
    for hit in subseqs_neg:
        header = "%s %s" % (correct_header_positions(query_id, hit['start'],
                                                     hit['end']), query_desc)
        results['non_match'].append((header, hit['sequence'], hit['start']))

    # sort by start position
    for type_ in results:
        results[type_] = sorted(results[type_], key=lambda x: x[2])

    # write sub-sequences to a multiple fasta file, sequences are un-wrapped
    if subsequences_fp is not None:
        # only the file operations can fail with IOError, so only they are
        # guarded; 'with' closes the handle even when a write fails
        try:
            for type_ in results:
                with open('%s.%s' % (subsequences_fp, type_), 'w') as f:
                    for res in results[type_]:
                        f.write(">%s\n%s\n" % res[:2])
        except IOError:
            raise IOError('Cannot write to file "%s"' % subsequences_fp)

    # removing the start position component from all subsequences
    return {
        type_: [res[:2] for res in results[type_]]
        for type_ in results
    }
예제 #4
0
def mask_sequence(hhsuite_fp, fullsequence_fp, subsequences_fp=None,
                  min_prob=None, max_pvalue=None, max_evalue=None,
                  min_fragment_length=0):
    """ Splits a protein sequence according to HHsuite results.

    The query sequence is divided into sub-sequences covered by a selected hit
    ('match') and the regions not covered by any selected hit ('non_match').

    Parameters
    ----------
    hhsuite_fp : str
        Filepath to HHblits/HHsearch output.
    fullsequence_fp : str
        Filepath to the protein sequence of the original query.
    subsequences_fp : str
        Filepath to which sub-sequences are written as a multiple fasta file.
        Each sequence makes up one header line and one sequence line, i.e.
        sequences are not wrapped.
        Two files will be produced, suffixed by '.match' and '.non_match'. The
        first holds sub-sequences of hits, the second holds the sub-sequences
        not covered by a hit.
        Default: None, i.e. no file is written.
    min_prob: float
        Minimal probability of a hit to be included in the resulting list.
        Note: probabilities are in the range of 100.0 to 0.0.
        Default: None, i.e. no filtering on probability.
    max_pvalue: float
        Maximal P-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on P-value.
    max_evalue: float
        Maximal E-value of a hit to be included in the resulting list.
        Default: None, i.e. no filtering on E-value.
    min_fragment_length: int
        Minimal fragment length of a hit to be included in the resulting list.
        The same value is also handed to report_uncovered_subsequences when
        the regions between hits are collected.
        Default: 0, i.e. no filtering on fragment length.

    Returns
    -------
    dict of str -> [(str, str)]
        Two keys, 'match' and 'non_match'. Each maps to a list of tuples whose
        first component is a fasta header, while the second is its fasta
        sequence. Both lists are sorted by start position.

    Raises
    ------
    IOError
        If one of the output files cannot be opened or written.

    Notes
    -----
    A hit must satisfy ALL filtering options (min_prob, max_pvalue, max_evalue,
    min_fragment_length) to be included in the resulting list.
    """

    # parse hits from file
    hits = parse_pdb_match(hhsuite_fp)

    # filter hits; passing None for a threshold switches that filter off
    if min_prob is not None:
        hits = [hit for hit in hits if hit['Probab'] >= min_prob]
    if max_pvalue is not None:
        hits = [hit for hit in hits if hit['P-value'] <= max_pvalue]
    if max_evalue is not None:
        hits = [hit for hit in hits if hit['E-value'] <= max_evalue]
    if min_fragment_length is not None:
        hits = [hit for hit in hits if frag_size(hit) >= min_fragment_length]

    # read the original protein file, used to run HHsearch
    p = Protein.read(fullsequence_fp, seq_num=1)
    query_id = p.metadata['id']
    query_desc = p.metadata['description']

    # entries are (header, sequence, start); start is only kept for sorting
    results = {'match': [], 'non_match': []}

    # select non overlapping positive hits
    # NOTE(review): the very large e_value_threshold presumably disables
    # select_hits' own E-value cut-off, as filtering happened above - confirm.
    subseqs_pos = select_hits(hits, e_value_threshold=999999)

    for hit in subseqs_pos:
        query_aln = hit['alignment'][get_q_id(hit)]
        match_id = hit['Hit'].split()[0]
        header = "%s %s %s" % (correct_header_positions(
            query_id,
            query_aln['start'],
            query_aln['end']), '# %s' % match_id, query_desc)
        # alignment gaps are not part of the query sequence itself
        seq = query_aln['sequence'].replace('-', '')
        results['match'].append((header, seq, query_aln['start']))

    # collect gaps between positive hits
    subseqs_neg = report_uncovered_subsequences(subseqs_pos, str(p),
                                                min_fragment_length)
    for hit in subseqs_neg:
        header = "%s %s" % (correct_header_positions(
            query_id,
            hit['start'],
            hit['end']), query_desc)
        results['non_match'].append((header, hit['sequence'], hit['start']))

    # sort by start position
    for type_ in results:
        results[type_] = sorted(results[type_], key=lambda x: x[2])

    # write sub-sequences to a multiple fasta file, sequences are un-wrapped
    if subsequences_fp is not None:
        # only the file operations can fail with IOError, so only they are
        # guarded; 'with' closes the handle even when a write fails
        try:
            for type_ in results:
                with open('%s.%s' % (subsequences_fp, type_), 'w') as f:
                    for res in results[type_]:
                        f.write(">%s\n%s\n" % res[:2])
        except IOError:
            raise IOError('Cannot write to file "%s"' % subsequences_fp)

    # removing the start position component from all subsequences
    return {type_: [res[:2] for res in results[type_]]
            for type_ in results}