Exemplo n.º 1
0
def _parse_fasta_raw(fh, data_parser, format_label):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    """
    line = next(fh)
    # header check inlined here and below for performance
    if line.startswith('>'):
        id_, desc = _parse_fasta_like_header(line)
    else:
        raise FASTAFormatError(
            "Found line without a header in %s-formatted file:\n%s" %
            (format_label, line))

    data_chunks = []
    for line in fh:
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        else:
            line = line.strip()
            if line:
                data_chunks.append(line)
            else:
                raise FASTAFormatError(
                    "Found blank or whitespace-only line in %s-formatted "
                    "file." % format_label)
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
Exemplo n.º 2
0
def _parse_fasta_raw(fh, data_parser, format_label):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    """
    line = next(fh)
    # header check inlined here and below for performance
    if line.startswith('>'):
        id_, desc = _parse_fasta_like_header(line)
    else:
        raise FASTAFormatError(
            "Found line without a header in %s-formatted file:\n%s" %
            (format_label, line))

    data_chunks = []
    for line in fh:
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        else:
            line = line.strip()
            if line:
                data_chunks.append(line)
            else:
                raise FASTAFormatError(
                    "Found blank or whitespace-only line in %s-formatted "
                    "file." % format_label)
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
Exemplo n.º 3
0
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator`` and the
        ``_parse_sequence_data`` / ``_parse_quality_scores`` helpers.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable
        Called as ``constructor(seq, metadata={'id': ..., 'description':
        ...}, positional_metadata={'quality': ...})`` for each record.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line does not start with ``'@'``, or if a
        quality header is neither a bare ``'+'`` nor matches the sequence
        header text after its first character.

    Notes
    -----
    Input with no non-blank lines produces no records.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # Nothing to parse. Returning here keeps StopIteration from escaping
        # the generator body, which PEP 479 turns into a RuntimeError.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # NOTE(review): the loop relies on _parse_quality_scores returning None
    # in place of the next header once input is exhausted -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # The quality header may be a bare '+' or repeat the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset,
                                                         qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores})
Exemplo n.º 4
0
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator`` and the
        ``_parse_sequence_data`` / ``_parse_quality_scores`` helpers.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable
        Called once per record with the sequence data, a ``metadata`` dict
        holding ``'id'`` and ``'description'``, a ``positional_metadata``
        dict holding ``'quality'``, and `kwargs`.
    **kwargs
        Extra keyword arguments passed through to `constructor`.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line does not start with ``'@'``, or if a
        quality header is neither a bare ``'+'`` nor matches the sequence
        header text after its first character.

    Notes
    -----
    Input with no non-blank lines produces no records.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # Empty input: finish without yielding. An escaping StopIteration
        # would surface as RuntimeError under PEP 479.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # NOTE(review): termination depends on _parse_quality_scores handing
    # back None as the next header at end of input -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # The quality header may be a bare '+' or repeat the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset,
                                                         qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
Exemplo n.º 5
0
def _parse_fasta_raw(fh, data_parser, format_label):
    """Raw parser for FASTA or QUAL files.

    Lazily yields raw ``(seq/qual, id, description)`` values, one tuple per
    record. It is the responsibility of the caller to construct the correct
    in-memory object to hold the data.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator``.
    data_parser : callable
        Called with the list of data lines collected for one record; its
        return value is yielded as the first tuple element.
    format_label : str
        Format name interpolated into error messages.

    Raises
    ------
    FASTAFormatError
        If the first non-blank line does not start with ``'>'``, or if a
        data line directly follows a blank line (a blank line inside a
        record). Blank lines followed by a header, or trailing the last
        record, are not rejected.

    Notes
    -----
    Input with no non-blank lines produces no records.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # Nothing to parse. Returning here keeps StopIteration from escaping
        # the generator body, which PEP 479 turns into a RuntimeError.
        return

    # header check inlined here and below for performance
    if seq_header.startswith('>'):
        id_, desc = _parse_fasta_like_header(seq_header)
    else:
        raise FASTAFormatError(
            "Found non-header line when attempting to read the 1st %s record:"
            "\n%s" % (format_label, seq_header))

    data_chunks = []
    # `prev` tracks the previously seen line so a data line arriving right
    # after a blank one can be detected.
    prev = seq_header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        elif line:
            # ensure no blank lines within a single record
            if not prev:
                raise FASTAFormatError(
                    "Found blank or whitespace-only line within %s "
                    "record." % format_label)
            data_chunks.append(line)
        prev = line
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
Exemplo n.º 6
0
def _parse_fasta_raw(fh, data_parser, error_type):
    """Raw parser for FASTA or QUAL files.

    Lazily yields raw ``(seq/qual, id, description)`` values, one tuple per
    record. It is the responsibility of the caller to construct the correct
    in-memory object to hold the data.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator``.
    data_parser : callable
        Called with the list of data lines collected for one record; its
        return value is yielded as the first tuple element.
    error_type : callable
        Exception class (or factory) invoked with a message string and
        raised on malformed input.

    Raises
    ------
    error_type
        If the first non-blank line does not start with ``'>'``, or if a
        data line directly follows a blank line (a blank line inside a
        record). Blank lines followed by a header, or trailing the last
        record, are not rejected.

    Notes
    -----
    Input with no non-blank lines produces no records.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # Empty input: finish without yielding. An escaping StopIteration
        # would surface as RuntimeError under PEP 479.
        return

    # header check inlined here and below for performance
    if seq_header.startswith('>'):
        id_, desc = _parse_fasta_like_header(seq_header)
    else:
        raise error_type(
            "Found non-header line when attempting to read the 1st record:"
            "\n%s" % seq_header)

    data_chunks = []
    # `prev` tracks the previously seen line so a data line arriving right
    # after a blank one can be detected.
    prev = seq_header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        elif line:
            # ensure no blank lines within a single record
            if not prev:
                raise error_type(
                    "Found blank or whitespace-only line within record.")
            data_chunks.append(line)
        prev = line
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
Exemplo n.º 7
0
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=BiologicalSequence):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator`` and the
        ``_parse_sequence_data`` / ``_parse_quality_scores`` helpers.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable
        Called as ``constructor(seq, id=..., description=..., quality=...)``
        for each record.

    Raises
    ------
    FASTQFormatError
        If the first line produced by ``_line_generator`` does not start
        with ``'@'``, or if a quality header is neither a bare ``'+'`` nor
        matches the sequence header text after its first character.

    Notes
    -----
    If ``_line_generator`` produces no lines, no records are yielded.

    """
    try:
        seq_header = next(_line_generator(fh))
    except StopIteration:
        # Nothing to parse. Returning here keeps StopIteration from escaping
        # the generator body, which PEP 479 turns into a RuntimeError.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # NOTE(review): the loop relies on _parse_quality_scores returning None
    # in place of the next header once input is exhausted -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh)

        # The quality header may be a bare '+' or repeat the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq), variant,
                                                         phred_offset)
        yield constructor(seq, id=id_, description=desc, quality=phred_scores)
Exemplo n.º 8
0
def _fastq_to_generator(fh,
                        variant=None,
                        phred_offset=None,
                        constructor=BiologicalSequence):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : iterable of str
        Source of lines, read through ``_line_generator`` and the
        ``_parse_sequence_data`` / ``_parse_quality_scores`` helpers.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable
        Called as ``constructor(seq, id=..., description=..., quality=...)``
        for each record.

    Raises
    ------
    FASTQFormatError
        If the first line produced by ``_line_generator`` does not start
        with ``'@'``, or if a quality header is neither a bare ``'+'`` nor
        matches the sequence header text after its first character.

    Notes
    -----
    If ``_line_generator`` produces no lines, no records are yielded.

    """
    try:
        seq_header = next(_line_generator(fh))
    except StopIteration:
        # Empty input: finish without yielding. An escaping StopIteration
        # would surface as RuntimeError under PEP 479.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r" %
            seq_header)

    # NOTE(review): termination depends on _parse_quality_scores handing
    # back None as the next header at end of input -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh)

        # The quality header may be a bare '+' or repeat the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset)
        yield constructor(seq, id=id_, description=desc, quality=phred_scores)
Exemplo n.º 9
0
 def test_id_and_description(self):
     """A header carrying both fields parses to an (id, description) pair."""
     header = '>!thus  suht! \t\t  \n'
     expected = ('!thus', 'suht!')
     self.assertEqual(_parse_fasta_like_header(header), expected)
Exemplo n.º 10
0
 def test_description_only(self):
     """Whitespace right after '>' gives an empty id and a description."""
     header = '> suht! \t\t  \n'
     expected = ('', 'suht!')
     self.assertEqual(_parse_fasta_like_header(header), expected)
Exemplo n.º 11
0
 def test_id_only(self):
     """A header with a single token gives that id and an empty description."""
     header = '>suht! \t\t  \n'
     expected = ('suht!', '')
     self.assertEqual(_parse_fasta_like_header(header), expected)
Exemplo n.º 12
0
 def test_no_id_or_description(self):
     """A header holding only whitespace parses to two empty strings."""
     header = '> \t\t  \n'
     expected = ('', '')
     self.assertEqual(_parse_fasta_like_header(header), expected)
Exemplo n.º 13
0
 def test_id_and_description(self):
     """Check that id and description are both extracted from one header."""
     parsed = _parse_fasta_like_header('>!thus  suht! \t\t  \n')
     want_id, want_desc = '!thus', 'suht!'
     self.assertEqual(parsed, (want_id, want_desc))
Exemplo n.º 14
0
 def test_description_only(self):
     """Check that a header starting with whitespace has an empty id."""
     parsed = _parse_fasta_like_header('> suht! \t\t  \n')
     want_id, want_desc = '', 'suht!'
     self.assertEqual(parsed, (want_id, want_desc))
Exemplo n.º 15
0
 def test_id_only(self):
     """Check that a lone token is the id and the description is empty."""
     parsed = _parse_fasta_like_header('>suht! \t\t  \n')
     want_id, want_desc = 'suht!', ''
     self.assertEqual(parsed, (want_id, want_desc))
Exemplo n.º 16
0
 def test_no_id_or_description(self):
     """Check that a whitespace-only header yields empty id and description."""
     parsed = _parse_fasta_like_header('> \t\t  \n')
     want_id, want_desc = '', ''
     self.assertEqual(parsed, (want_id, want_desc))