예제 #1
0
def _label_line_parser(record, splitter, strict=True):
    """Returns dict mapping list of data to labels, plus list with field order.

    Field order contains labels in order encountered in file.

    NOTE: doesn't care if lines are out of order in different blocks. This
    should never happen anyway, but it's possible that this behavior should
    be changed to tighten up validation.
    """
    labels = []
    result = {}
    for line in record:
        try:
            key, val = splitter(line.rstrip())
        except:
            if strict:
                raise RecordError(
                    "Failed to extract key and value from line %s" % line)
            else:
                continue  # just skip the line if not strict

        if key in result:
            result[key].append(val)
        else:
            result[key] = [val]
            labels.append(key)
    return result, labels
예제 #2
0
def parse_qual(infile, full_header=False):
    r"""Generate ``(label, quality scores)`` pairs from a qual file.

    Records are located with ``FastaFinder``. The first line of each record
    (minus its leading character) is the header; the remaining lines are
    joined with spaces and converted to an integer array.

    Parameters
    ----------
    infile : open file object or str
        An open fasta file or path to it.

    full_header : bool
        If ``True`` yield the whole header line as the label; otherwise
        yield only its first whitespace-separated field (the id).

    Returns
    -------
    label : str
        The quality label
    qual : array
        The quality at each position

    Raises
    ------
    RecordError
        If any quality score of a record cannot be converted to an integer.

    Examples
    --------
    Assume we have a qual formatted file with the following contents::

        >seq1
        10 20 30 40
        >seq2
        1 2 3 4

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_qual
    >>> qual_f = StringIO('>seq1\n'
    ...                   '10 20 30 40\n'
    ...                   '>seq2\n'
    ...                   '1 2 3 4\n')
    >>> for label, qual in parse_qual(qual_f):
    ...     print(label)
    ...     print(qual)
    seq1
    [10 20 30 40]
    seq2
    [1 2 3 4]

    """
    for record in FastaFinder(infile):
        # Drop the leading marker character from the header line.
        header = record[0][1:]
        score_text = ' '.join(record[1:])
        try:
            scores = np.asarray(score_text.split(), dtype=int)
        except ValueError:
            raise RecordError(
                "Invalid qual file. Check the format of the qual file: each "
                "quality score must be convertible to an integer.")
        label = header if full_header else header.split()[0]
        yield (label, scores)
예제 #3
0
 def parser(lines):
     """Yield successive lists of ``num`` kept lines.

     Each input line is passed through ``constructor`` when one is set, and
     dropped when ``ignore`` returns true for it. Raises ``RecordError`` if
     the kept lines do not divide evenly into groups of ``num``.
     """
     batch = []
     for raw in lines:
         item = raw if constructor is None else constructor(raw)
         if ignore(item):
             continue
         batch.append(item)
         if len(batch) == num:
             yield batch
             batch = []
     # Anything left over is an incomplete group.
     if batch:
         raise RecordError("Non-blank lines not even multiple of %s" % num)
def verify_valid_fasta_format(input_fasta_fp):
    """Check that a file can be parsed as fasta from start to finish.

    The file is run through ``parse_fasta``; the parsed records themselves
    are discarded.

    Parameters
    ----------
    input_fasta_fp : str
        fasta filepath

    Raises
    ------
    RecordError
        If ``parse_fasta`` raises ``RecordError``. The message reports the
        label and sequence of the last record parsed successfully before
        the failure (``None`` for both if the very first record is invalid).
    """
    import sys

    # "U" asks for universal newlines on Python 2. On Python 3 text mode
    # already behaves that way, and the "U" flag was removed in Python 3.11.
    mode = "U" if sys.version_info[0] < 3 else "r"

    # Pre-bind so the error message can be built even if the first record
    # is the one that fails.
    label = seq = None

    # The context manager closes the file on the error path as well.
    with open(input_fasta_fp, mode) as fasta_f:
        try:
            for label, seq in parse_fasta(fasta_f):
                pass
        except RecordError:
            raise RecordError(
                "Input fasta file not valid fasta format.  Error "
                "found at %s label and %s sequence " % (label, seq))
예제 #5
0
    def parser(lines):
        """Yield lists of lines, each ending with a tail line.

        Each input line is passed through ``constructor`` when one is set,
        and dropped when ``ignore`` returns true for it. A record is emitted
        as soon as ``is_tail_line`` accepts a line; that line is the last
        element of the record.
        """
        pending = []
        for raw in lines:
            item = constructor(raw) if constructor is not None else raw
            if ignore(item):
                continue

            pending.append(item)
            # A tail line closes the record being accumulated.
            if is_tail_line(item):
                yield pending
                pending = []

        if not pending:
            return
        # Lines remain that were never closed by a tail line.
        if strict:
            raise RecordError('lines exist after the last tail_line '
                              'or no tail_line at all')
        yield pending
예제 #6
0
 def parser(lines):
     """Yield lists of lines separated by ``delimiter`` lines.

     Each input line is passed through ``constructor`` when one is set, and
     dropped when ``ignore`` returns true for it. The delimiter line is
     included at the end of a record only if ``keep_delimiter`` is true.
     """
     record = []
     for raw in lines:
         item = raw if constructor is None else constructor(raw)
         if ignore(item):
             continue
         if item == delimiter:
             # The delimiter closes the current record.
             if keep_delimiter:
                 record.append(item)
             yield record
             record = []
             continue
         record.append(item)
     if not record:
         return
     # Data was found after the final delimiter.
     if strict:
         raise RecordError("Found additional data after records: %s" %
                           record)
     yield record
예제 #7
0
def parse_fasta(infile, strict=True, label_to_name=None, finder=FastaFinder,
                label_characters='>', ignore_comment=False):
    r"""Generator of labels and sequences from a fasta file.

    Parameters
    ----------
    infile : open file object or str
        An open fasta file or a path to a fasta file.

    strict : bool
        If ``True`` a ``RecordError`` will be raised if there is a fasta label
        line with no associated sequence, or a sequence with no associated
        label line (in other words, if there is a partial record). If
        ``False``, partial records will be skipped.

    label_to_name : function
        A function to apply to the sequence label (i.e., text on the header
        line) before yielding it. By default, the sequence label is returned
        with no processing. This function must take a single string as input
        and return a single string as output.

    finder : function
        The function to apply to find records in the fasta file. In general
        you should not have to change this.

    label_characters : str
        String used to indicate the beginning of a new record. In general you
        should not have to change this.

    ignore_comment : bool
        If `True`, split the sequence label on spaces, and return the label
        only as the first space separated field (i.e., the sequence
        identifier). An empty label is returned unchanged. Note: if both
        ``ignore_comment`` and ``label_to_name`` are passed,
        ``ignore_comment`` is ignored (both operate on the label, so there is
        potential for things to get messy otherwise).

    Returns
    -------
    two-item tuple of str
        yields the label and sequence for each entry.

    Raises
    ------
    RecordError
        If ``strict == True``, raises a ``RecordError`` if there is a fasta
        label line with no associated sequence, or a sequence with no
        associated label line (in other words, if there is a partial record).

    Examples
    --------
    Assume we have a fasta-formatted file with the following contents::

        >seq1 db-accession-149855
        CGATGTCGATCGATCGATCGATCAG
        >seq2 db-accession-34989
        CATCGATCGATCGATGCATGCATGCATG

    >>> from StringIO import StringIO
    >>> fasta_f = StringIO('>seq1 db-accession-149855\n'
    ...                    'CGATGTCGATCGATCGATCGATCAG\n'
    ...                    '>seq2 db-accession-34989\n'
    ...                    'CATCGATCGATCGATGCATGCATGCATG\n')

    We can parse this as follows:

    >>> from skbio.parse.sequences import parse_fasta
    >>> for label, seq in parse_fasta(fasta_f):
    ...     print(label, seq)
    seq1 db-accession-149855 CGATGTCGATCGATCGATCGATCAG
    seq2 db-accession-34989 CATCGATCGATCGATGCATGCATGCATG

    The sequence label or header line in a fasta file is defined as containing
    two separate pieces of information, delimited by a space. The first space-
    separated entry is the sequence identifier, and everything following the
    first space is considered additional information (e.g., comments about the
    source of the sequence or the molecule that it encodes). Often we don't
    care about that information within our code. If you want to just return the
    sequence identifier from that line, you can pass ``ignore_comment=True``:

    >>> from StringIO import StringIO
    >>> fasta_f = StringIO('>seq1 db-accession-149855\n'
    ...                    'CGATGTCGATCGATCGATCGATCAG\n'
    ...                    '>seq2 db-accession-34989\n'
    ...                    'CATCGATCGATCGATGCATGCATGCATG\n')

    >>> from skbio.parse.sequences import parse_fasta
    >>> for label, seq in parse_fasta(fasta_f, ignore_comment=True):
    ...     print(label, seq)
    seq1 CGATGTCGATCGATCGATCGATCAG
    seq2 CATCGATCGATCGATGCATGCATGCATG

    """

    for rec in finder(infile):
        # first line must be a label line
        if rec[0][0] not in label_characters:
            if strict:
                raise RecordError(
                    "Found Fasta record without label line: %s" % rec)
            else:
                continue
        # record must have at least one sequence
        if len(rec) < 2:
            if strict:
                raise RecordError(
                    "Found label line without sequences: %s" % rec)
            else:
                continue

        # remove the label character from the beginning of the label
        label = rec[0][1:].strip()
        # if the user passed a label_to_name function, apply that to the label
        if label_to_name is not None:
            label = label_to_name(label)
        # otherwise, if the user passed ignore_comment, split the label on
        # spaces, and return the first space separated field (i.e., the
        # sequence identifier)
        elif ignore_comment:
            fields = label.split()
            # A bare label character with no text leaves nothing to split;
            # keep the empty label instead of failing with IndexError.
            if fields:
                label = fields[0]

        # join the sequence lines into a single string
        seq = ''.join(rec[1:])

        yield label, seq
예제 #8
0
def parse_qual(infile, full_header=False):
    r"""Generate ``(label, quality scores)`` pairs from a qual file.

    .. note:: Deprecated in scikit-bio 0.2.0-dev
       ``parse_qual`` will be removed in scikit-bio 0.3.0. It is replaced by
       ``read``, which is a more general method for deserializing
       FASTA/QUAL-formatted files. ``read`` supports multiple file formats,
       automatic file format detection, etc. by taking advantage of
       scikit-bio's I/O registry system. See :mod:`skbio.io` for more details.

    Records are located with ``FastaFinder``. The first line of each record
    (minus its leading character) is the header; the remaining lines are
    joined with spaces and converted to an integer array. A
    ``DeprecationWarning`` is issued before the first record is read.

    Parameters
    ----------
    infile : open file object or str
        An open fasta file or path to it.

    full_header : bool
        If ``True`` yield the whole header line as the label; otherwise
        yield only its first whitespace-separated field (the id).

    Returns
    -------
    label : str
        The quality label
    qual : array
        The quality at each position

    Raises
    ------
    RecordError
        If any quality score of a record cannot be converted to an integer.

    Examples
    --------
    Assume we have a qual formatted file with the following contents::

        >seq1
        10 20 30 40
        >seq2
        1 2 3 4

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_qual
    >>> qual_f = StringIO('>seq1\n'
    ...                   '10 20 30 40\n'
    ...                   '>seq2\n'
    ...                   '1 2 3 4\n')
    >>> for label, qual in parse_qual(qual_f):
    ...     print(label)
    ...     print(qual)
    seq1
    [10 20 30 40]
    seq2
    [1 2 3 4]

    """
    warnings.warn(
        "`parse_qual` is deprecated and will be removed in scikit-bio 0.3.0. "
        "Please update your code to use "
        "`skbio.io.read(fasta_fh, qual=qual_fh, format='fasta')` to obtain a "
        "generator of `BiologicalSequence` objects (or subclasses, see the "
        "`constructor` parameter) with quality scores.", DeprecationWarning)

    for record in FastaFinder(infile):
        header_line = record[0]
        score_lines = record[1:]
        # Drop the leading marker character from the header line.
        header = header_line[1:]
        tokens = ' '.join(score_lines).split()
        try:
            scores = np.asarray(tokens, dtype=int)
        except ValueError:
            raise RecordError(
                "Invalid qual file. Check the format of the qual file: each "
                "quality score must be convertible to an integer.")
        if not full_header:
            # Keep only the identifier (first whitespace-separated field).
            header = header.split()[0]
        yield (header, scores)