예제 #1
0
def _parse_sequence_data(fh, prev):
    """Accumulate sequence lines of one FASTQ record up to its '+' line.

    Returns a tuple of (joined sequence string, the '+' header line).
    Raises ``FASTQFormatError`` when the record has no sequence data,
    when a new '@' header appears before any '+' line, when whitespace
    occurs inside the sequence, or when EOF truncates the record.
    """
    collected = []
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('+'):
            # quality header reached: the sequence section is complete
            if not prev:
                _blank_error("before '+'")
            if not collected:
                raise FASTQFormatError(
                    "Found FASTQ record without sequence data.")
            return ''.join(collected), line
        if line.startswith('@'):
            # next record started without a '+' line for this one
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header line "
                "after sequence data.")
        if not prev:
            _blank_error("after header or within sequence")
        if _whitespace_regex.search(line):
            raise FASTQFormatError(
                "Found whitespace in sequence data: %r" % str(line))
        collected.append(line)
        prev = line

    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
예제 #2
0
def _parse_records(fh, constructor=None, **kwargs):
    """Parse SAM lines into ``(seq, metadata)`` tuples.

    Header lines (starting with '@') accumulate into a metadata dict
    merged into every subsequent record; '@CO' comment headers are
    skipped.

    Parameters
    ----------
    fh : file handle
        Open SAM file.
    constructor : callable, optional
        Unused here; kept for interface compatibility.

    Yields
    ------
    tuple
        (sequence string, metadata dict) per alignment line.
    """
    # Fix: removed dead initializations (`res = None`, per-line `md = {}`)
    # that were never read before being overwritten.
    header_md = {}
    n = len(_REQUIRED_FIELDS)
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        # parse the header (would be nice to abstract this pattern out)
        if line.startswith('@'):
            key, val = line.split('\t', 1)
            if key != '@CO':
                header_md[key] = val
        # parse the actual sequences
        else:
            tabs = line.split('\t')
            # zip stops generating after the shorter list of the two,
            # so only the required leading columns are consumed here
            md = {
                k: _parse_required(v)
                for k, v in zip(_REQUIRED_FIELDS, tabs)
            }

            seq = md.pop('SEQ')

            # remaining tab-separated fields are optional key:value tags
            md.update(_parse_optional(field) for field in tabs[n:])
            md.update(header_md)
            yield seq, md
예제 #3
0
def _parse_quality_scores(fh, seq_len, variant, phred_offset, prev):
    """Decode the quality lines of one FASTQ record.

    Reads until the next sequence (@) header line or EOF. Returns a
    tuple of (stacked phred scores, next '@' header line or ``None`` at
    end of file). Raises ``FASTQFormatError`` when the quality string is
    longer or shorter than the sequence.
    """
    decoded = []
    n_chars = 0
    for line in _line_generator(fh, skip_blanks=False):
        if line:
            # an '@' line only terminates the record once enough quality
            # characters were read; otherwise it is quality data that
            # merely starts with '@'
            if line.startswith('@') and n_chars == seq_len:
                return np.hstack(decoded), line
            if not prev:
                _blank_error("after '+' or within quality scores")
            n_chars += len(line)

            if n_chars > seq_len:
                raise FASTQFormatError(
                    "Found more quality score characters than sequence "
                    "characters. Extra quality score characters: %r" %
                    line[-(n_chars - seq_len):])

            decoded.append(
                _decode_qual_to_phred(line, variant=variant,
                                      phred_offset=phred_offset))
        prev = line

    if n_chars != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return np.hstack(decoded), None
예제 #4
0
def _fastq_to_generator(fh,
                        variant=None,
                        phred_offset=None,
                        constructor=Sequence,
                        **kwargs):
    """Yield one constructed sequence object per FASTQ record in ``fh``.

    Parameters
    ----------
    fh : file handle
        Open FASTQ file.
    variant : str, optional
        FASTQ quality variant, forwarded to the phred decoder.
    phred_offset : int, optional
        Explicit phred offset, forwarded to the phred decoder.
    constructor : callable
        Factory used to build each record.

    Yields
    ------
    constructor instance
        One per record, with 'id'/'description' metadata and
        positional 'quality' scores.
    """
    # Skip any blank or whitespace-only lines at beginning of file.
    # Fix: guard next() -- a StopIteration escaping a generator raises
    # RuntimeError under PEP 479 (Python 3.7+) on an empty file; treat
    # it as "no records", mirroring _parse_fasta_raw.
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r" %
            str(seq_header))

    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # a bare '+' is allowed; otherwise the '+' line must repeat the
        # '@' header's content
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(seq_header[1:]), str(qual_header[1:])))

        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset, qual_header)
        yield constructor(seq,
                          metadata={
                              'id': id_,
                              'description': desc
                          },
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
예제 #5
0
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Yield one constructed sequence object per FASTQ record in ``fh``.

    Parameters
    ----------
    fh : file handle
        Open FASTQ file.
    variant : str, optional
        Quality-score variant, forwarded to the phred decoder.
    phred_offset : int, optional
        Explicit phred offset, forwarded to the phred decoder.
    constructor : callable
        Factory used to build each yielded record.
    """
    # Skip any blank or whitespace-only lines at beginning of file.
    # Fix: an empty file made next() raise StopIteration inside this
    # generator, which is a RuntimeError under PEP 479 (Python 3.7+);
    # return an empty generator instead, as _parse_fasta_raw does.
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % str(seq_header))

    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # '+' may stand alone or must repeat the '@' header's content
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (str(seq_header[1:]), str(qual_header[1:])))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset,
                                                         qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
예제 #6
0
def _parse_sequence_data(fh, prev):
    """Collect the sequence lines of one FASTQ record up to its '+' line.

    Parameters
    ----------
    fh : file handle
        Positioned just after the record's '@' header line.
    prev : str
        The previously read line ('' when it was blank/whitespace-only).

    Returns
    -------
    tuple of (str, str)
        The concatenated sequence and the '+' quality header line.

    Raises
    ------
    FASTQFormatError
        On missing sequence data, a missing '+' header, whitespace
        inside sequence data, or EOF before the record is complete.
    """
    seq_chunks = []
    for chunk in _line_generator(fh, skip_blanks=False):
        if chunk.startswith('+'):
            # quality header found; the sequence section is complete
            if not prev:
                _blank_error("before '+'")
            if not seq_chunks:
                raise FASTQFormatError(
                    "Found FASTQ record without sequence data.")
            return ''.join(seq_chunks), chunk
        elif chunk.startswith('@'):
            # a new record header before any '+' means the quality header
            # for the current record is missing
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header line "
                "after sequence data.")
        else:
            if not prev:
                _blank_error("after header or within sequence")
            if _whitespace_regex.search(chunk):
                raise FASTQFormatError(
                    "Found whitespace in sequence data: %r" % str(chunk))
            seq_chunks.append(chunk)
        prev = chunk

    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
예제 #7
0
def _yield_record(fh):
    '''Yield (seq_id, lines) that belong to the same sequence.

    Yields ('length', seq_id, length) for ##sequence-region pragmas and
    ('data', seq_id, lines) for each run of annotation lines sharing the
    same sequence id. Parsing stops at the ##FASTA section.
    '''
    lines = []
    current = False  # sentinel: no annotation line seen yet
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('##sequence-region'):
            _, seq_id, start, end = line.split()
            length = int(end) - int(start) + 1
            yield 'length', seq_id, length
        if line.startswith('##FASTA'):
            # stop once reaching the sequence section
            break
        if not line.startswith('#'):
            try:
                seq_id, _ = line.split('\t', 1)
            except ValueError:
                raise GFF3FormatError('Wrong GFF3 format at line: %s' % line)
            if current == seq_id:
                lines.append(line)
            else:
                if current is not False:
                    yield 'data', current, lines
                lines = [line]
                current = seq_id
    # Flush the final group. Fix: removed the unreachable `yield` after
    # `return` -- the function already contains yields, so it is a
    # generator and an empty input naturally produces an empty one.
    if current is not False:
        yield 'data', current, lines
예제 #8
0
파일: gff3.py 프로젝트: ElDeveloper/biolopy
def _yield_record(fh):
    '''Yield (seq_id, lines) that belong to the same sequence.

    Emits ('length', seq_id, length) for each ##sequence-region pragma
    and ('data', seq_id, lines) for each group of annotation lines with
    the same sequence id; stops at the ##FASTA section.
    '''
    lines = []
    current = False  # sentinel meaning "no data line processed yet"
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('##sequence-region'):
            _, seq_id, start, end = line.split()
            length = int(end) - int(start) + 1
            yield 'length', seq_id, length
        if line.startswith('##FASTA'):
            # stop once reaching the sequence section
            break
        if not line.startswith('#'):
            try:
                seq_id, _ = line.split('\t', 1)
            except ValueError:
                raise GFF3FormatError(
                    'Wrong GFF3 format at line: %s' % line)
            if current == seq_id:
                lines.append(line)
            else:
                if current is not False:
                    yield 'data', current, lines
                lines = [line]
                current = seq_id
    # Flush the last group. Fix: dropped the dead `yield` that followed
    # `return`; the yields above already make this a generator, so an
    # empty file yields nothing without the extra statement.
    if current is not False:
        yield 'data', current, lines
예제 #9
0
def _parse_quality_scores(fh, seq_len, variant, phred_offset, prev):
    """Decode the quality lines of one FASTQ record.

    Parameters
    ----------
    fh : file handle
        Positioned just after the record's '+' line.
    seq_len : int
        Length of the record's sequence; the quality string must match.
    variant : str or None
        Quality variant, forwarded to ``_decode_qual_to_phred``.
    phred_offset : int or None
        Explicit offset, forwarded to ``_decode_qual_to_phred``.
    prev : str
        The previously read line ('' when blank/whitespace-only).

    Returns
    -------
    tuple of (numpy.ndarray, str or None)
        Stacked phred scores and the next '@' header line, or ``None``
        when EOF was reached.

    Raises
    ------
    FASTQFormatError
        When more or fewer quality characters than ``seq_len`` exist.
    """
    phred_scores = []
    qual_len = 0
    for chunk in _line_generator(fh, skip_blanks=False):
        if chunk:
            # an '@' line only ends the record once the quality length
            # matches the sequence; otherwise it is quality data that
            # merely begins with '@'
            if chunk.startswith('@') and qual_len == seq_len:
                return np.hstack(phred_scores), chunk
            else:
                if not prev:
                    _blank_error("after '+' or within quality scores")
                qual_len += len(chunk)

                if qual_len > seq_len:
                    raise FASTQFormatError(
                        "Found more quality score characters than sequence "
                        "characters. Extra quality score characters: %r" %
                        chunk[-(qual_len - seq_len):])

                phred_scores.append(
                    _decode_qual_to_phred(chunk,
                                          variant=variant,
                                          phred_offset=phred_offset))
        prev = chunk

    if qual_len != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return np.hstack(phred_scores), None
예제 #10
0
파일: embl.py 프로젝트: sjanssen2/micronota
def _parse_records(fh, parser):
    """Apply ``parser`` to each '//'-terminated record in ``fh``.

    Non-blank lines are buffered until a line starting with '//' is
    seen, at which point ``parser`` is called on the buffered lines and
    its result yielded.
    """
    buffered = []
    for line in _line_generator(fh, skip_blanks=True, strip=False):
        if not line.startswith('//'):
            buffered.append(line)
        else:
            # end-of-record marker: hand the buffered lines to the parser
            yield parser(buffered)
            buffered = []
예제 #11
0
def _parse_genbanks(fh):
    """Yield one parsed GenBank record per '//'-terminated block.

    Lines are accumulated until a '//' terminator line, then passed to
    ``_parse_single_genbank``.
    """
    record_lines = []
    for line in _line_generator(fh, skip_blanks=True, strip=False):
        if not line.startswith('//'):
            record_lines.append(line)
        else:
            # '//' marks the end of one GenBank record
            yield _parse_single_genbank(record_lines)
            record_lines = []
예제 #12
0
파일: embl.py 프로젝트: biocore/micronota
def _parse_records(fh, parser):
    """Yield ``parser(record_lines)`` for every '//'-delimited record."""
    record = []
    for line in _line_generator(fh, skip_blanks=True, strip=False):
        if line.startswith('//'):
            # terminator reached: emit the record and start a new one
            yield parser(record)
            record = []
            continue
        record.append(line)
예제 #13
0
def _parse_genbanks(fh):
    """Yield the result of ``_parse_single_genbank`` for each record.

    Records are delimited by lines starting with '//'.
    """
    pending = []
    for line in _line_generator(fh, skip_blanks=True, strip=False):
        if not line.startswith('//'):
            pending.append(line)
            continue
        # record terminator: parse the buffered lines and reset
        yield _parse_single_genbank(pending)
        pending = []
예제 #14
0
 def parser(lines):
     """Split ``lines`` into sections, yielding one list per section.

     A new section begins whenever ``is_another_section`` is true for a
     line; the section accumulated so far is flushed first.
     """
     section = []
     for line in _line_generator(lines, **kwargs):
         # a section-start line flushes whatever was accumulated
         if is_another_section(line) and section:
             yield section
             section = []
         section.append(line)
     # emit the trailing section, if any
     if section:
         yield section
예제 #15
0
 def parser(lines):
     """Group ``lines`` into sections and yield each section as a list.

     A section ends when ``is_another_section`` (from the enclosing
     scope) is true for the next line; ``kwargs`` are forwarded to
     ``_line_generator``.
     """
     curr = []
     for line in _line_generator(lines, **kwargs):
         # if we find another, return the previous section
         if is_another_section(line):
             if curr:
                 yield curr
                 curr = []
         curr.append(line)
     # don't forget to return the last section in the file
     if curr:
         yield curr
예제 #16
0
파일: embl.py 프로젝트: biocore/micronota
def _embl_sniffer(fh):
    """Sniff whether ``fh`` looks like an EMBL file.

    Returns ``(True, {})`` when the first non-blank line starts with
    'ID', otherwise ``(False, {})``. Files beginning with too many blank
    lines are rejected outright.
    """
    if _too_many_blanks(fh, 5):
        return False, {}
    # the 1st real line must be an ID line
    try:
        first = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # file contained nothing but blanks
        return False, {}
    return first.startswith('ID'), {}
예제 #17
0
def _parse_fasta_raw(fh, data_parser, error_type):
    """Raw parser for FASTA or QUAL files.

    Yields raw values (seq/qual, id, description). It is the
    responsibility of the caller to construct the correct in-memory
    object to hold the data.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        return

    # header check inlined here and below for performance
    if not header.startswith('>'):
        raise error_type(
            "Found non-header line when attempting to read the 1st record:"
            "\n%s" % header)
    id_, desc = _parse_fasta_like_header(header)

    chunks = []
    last = header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(chunks), id_, desc
            chunks = []
            id_, desc = _parse_fasta_like_header(line)
        elif line:
            # ensure no blank lines within a single record
            if not last:
                raise error_type(
                    "Found blank or whitespace-only line within record.")
            chunks.append(line)
        last = line
    # yield last record in file
    yield data_parser(chunks), id_, desc
예제 #18
0
파일: embl.py 프로젝트: sjanssen2/micronota
def _embl_sniffer(fh):
    """Return ``(True, {})`` if ``fh`` begins with an EMBL 'ID' line."""
    # reject files padded with too many leading blank lines
    if _too_many_blanks(fh, 5):
        return False, {}
    # inspect only the first real line
    for line in _line_generator(fh, skip_blanks=True, strip=False):
        if line.startswith('ID'):
            return True, {}
        return False, {}
    # file contained nothing but blanks
    return False, {}
예제 #19
0
def _parse_fasta_raw(fh, data_parser, error_type):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    Parameters
    ----------
    fh : file handle
        Open FASTA or QUAL file.
    data_parser : callable
        Receives the list of data lines of one record and returns the
        parsed payload.
    error_type : Exception subclass
        Error class raised on format violations.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # empty file: yield nothing
        return

    # header check inlined here and below for performance
    if seq_header.startswith('>'):
        id_, desc = _parse_fasta_like_header(seq_header)
    else:
        raise error_type(
            "Found non-header line when attempting to read the 1st record:"
            "\n%s" % seq_header)

    data_chunks = []
    prev = seq_header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        else:
            if line:
                # ensure no blank lines within a single record
                if not prev:
                    raise error_type(
                        "Found blank or whitespace-only line within record.")
                data_chunks.append(line)
        prev = line
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
예제 #20
0
def _gff3_sniffer(fh):
    """Sniff for GFF3: the first real line must be a gff-version pragma."""
    # too many leading blank lines -> reject
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        first = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # nothing but blanks
        return False, {}

    matched = re.match(r'##gff-version\s+3', first) is not None
    return matched, {}
예제 #21
0
def _genbank_sniffer(fh):
    """Sniff for GenBank: the first real line must parse as a LOCUS line."""
    # reject files with too many leading blank lines
    if _too_many_blanks(fh, 5):
        return False, {}
    try:
        first = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # empty file
        return False, {}

    try:
        _parse_locus([first])
    except GenBankFormatError:
        return False, {}
    else:
        return True, {}
예제 #22
0
def _genbank_sniffer(fh):
    """Return ``(True, {})`` when ``fh`` starts with a parseable LOCUS line."""
    # bail out on files padded with too many blank lines
    if _too_many_blanks(fh, 5):
        return False, {}
    try:
        first = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # nothing but blanks/EOF
        return False, {}

    try:
        _parse_locus([first])
        return True, {}
    except GenBankFormatError:
        return False, {}
예제 #23
0
파일: gff3.py 프로젝트: ElDeveloper/biolopy
def _gff3_sniffer(fh):
    """Sniff whether ``fh`` is a GFF3 file.

    Returns ``(True, {})`` iff the first non-blank line matches the
    '##gff-version 3' pragma required at the top of GFF3 files.
    """
    # check the 1st real line is the gff-version pragma
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        # empty file: cannot be GFF3
        return False, {}

    if re.match(r'##gff-version\s+3', line):
        return True, {}
    else:
        return False, {}
예제 #24
0
파일: embl.py 프로젝트: biocore/micronota
def _yield_section(lines, split_header, **kwargs):
    '''Yield (header, lines) groups that share the same section header.

    Parameters
    ----------
    lines : iterable of str
        Lines of one record; ``lines[0]`` defines the initial header.
    split_header : function
        It accepts a string of line and returns the header and
        rest of the needed data as a tuple. If no header exists,
        return ``None`` and the data.
    kwargs : dict
        Forwarded to ``_line_generator``.

    Notes
    -----
    The following example is a valid section::

    DR   GO GO:000001
    DR   GO GO:000002

    This is also a section::

    SQ   ATGCA ATGCA
         ATGCA ATGCA
    '''
    curr = []
    # the first line determines the initial section header
    header, _ = split_header(lines[0])

    for line in _line_generator(lines, **kwargs):
        items = split_header(line)

        # if the header is changed, it is a new section
        # NOTE(review): ``header`` is only updated when ``curr`` is
        # non-empty; this relies on the first generated line matching
        # ``lines[0]`` so ``curr`` is never empty at a header change --
        # confirm against _line_generator's behavior.
        if items[0] is not None and items[0] != header:
            if curr:
                yield header, curr
                curr = []
                header = items[0]

        curr.append(items[1].strip())
    # don't forget to return the last section in the file
    if curr:
        yield header, curr
예제 #25
0
파일: embl.py 프로젝트: sjanssen2/micronota
def _yield_section(lines, split_header, **kwargs):
    '''Yield the lines with the same header as (header, lines) pairs.

    Parameters
    ----------
    lines : iterable of str
        Lines of one record; the first line supplies the initial header.
    split_header : function
        It accepts a string of line and returns the header and
        rest of the needed data as a tuple. If no header exists,
        return ``None`` and the data.
    kwargs : dict
        Forwarded to ``_line_generator``.

    Notes
    -----
    The following example is a valid section::

    DR   GO GO:000001
    DR   GO GO:000002

    This is also a section::

    SQ   ATGCA ATGCA
         ATGCA ATGCA
    '''
    curr = []
    # seed the header from the record's first line
    header, _ = split_header(lines[0])

    for line in _line_generator(lines, **kwargs):
        items = split_header(line)

        # if the header is changed, it is a new section
        # NOTE(review): the header assignment sits inside ``if curr:``;
        # a header change while ``curr`` is empty would be skipped.
        # Presumably unreachable because the first generated line is
        # ``lines[0]`` -- verify before refactoring.
        if items[0] is not None and items[0] != header:
            if curr:
                yield header, curr
                curr = []
                header = items[0]

        curr.append(items[1].strip())
    # don't forget to return the last section in the file
    if curr:
        yield header, curr
예제 #26
0
파일: sam.py 프로젝트: biocore/micronota
def _parse_records(fh, constructor=None, **kwargs):
    """Parse SAM lines into ``(seq, metadata)`` tuples.

    '@'-prefixed header lines accumulate into a dict merged into each
    subsequent record's metadata; '@CO' comment headers are ignored.

    Parameters
    ----------
    fh : file handle
        Open SAM file.
    constructor : callable, optional
        Unused here; kept for interface compatibility.

    Yields
    ------
    tuple
        (sequence string, metadata dict) per alignment line.
    """
    # Fix: dropped the never-read `res = None` and per-iteration
    # `md = {}` dead assignments; also wrapped the over-length dict
    # comprehension line.
    header_md = {}
    n = len(_REQUIRED_FIELDS)
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        # parse the header (would be nice to abstract this pattern out)
        if line.startswith('@'):
            key, val = line.split('\t', 1)
            if key != '@CO':
                header_md[key] = val
        # parse the actual sequences
        else:
            tabs = line.split('\t')
            # zip stops generating after the shorter list of the two
            md = {k: _parse_required(v)
                  for k, v in zip(_REQUIRED_FIELDS, tabs)}

            seq = md.pop('SEQ')

            # trailing tab-separated fields are optional key:value tags
            md.update(_parse_optional(field) for field in tabs[n:])
            md.update(header_md)
            yield seq, md
예제 #27
0
def _parse_records(fh):
    """Parse a tabular cmsearch/cmscan (Infernal) output file.

    Parameters
    ----------
    fh : file handle
         A file handle to the file that should be parsed.

    Yields
    ------
    dict
        One annotation dict per query sequence, mapping a ``Feature``
        (frozen attribute dict) to a list of (start, end) interval
        tuples.

    Raises
    ------
    CmscanFormatError
        If a position column is not an integer, the strand character is
        unknown, or the positions contradict the strand orientation.
    """
    program = 'CMSCAN'
    currentsequence = False
    annotations = {}
    # column positions are fixed: look them up once, not per line
    start_col = _COLUMNS.index('SEQUENCE_START_POSITION')
    end_col = _COLUMNS.index('SEQUENCE_END_POSITION')
    strand_col = _COLUMNS.index('STRAND')
    for line in _line_generator(fh, skip_blanks=True, strip=True):
        if line.startswith('#'):
            # comment/header line
            continue
        attributes = {}
        fields = line.split()  # split at multiple occurrences of spaces

        # checking data for start and end position of the hit in the
        # query sequence.
        if fields[start_col].isdigit():
            hitStart = int(fields[start_col])
        else:
            raise CmscanFormatError("%s %i %s '%s'." % (
                   "Column",
                   start_col,
                   "must be an integer value for the start position of the"
                   " hit. Here, it is",
                   fields[start_col]
                  ))

        # Fix: the original passed 5 arguments to a 4-specifier format
        # string here, so this raise itself crashed with TypeError; the
        # stray "." argument is removed (the format already ends in '.').
        if fields[end_col].isdigit():
            hitEnd = int(fields[end_col])
        else:
            raise CmscanFormatError("%s %i %s '%s'." % (
                    "Column",
                    end_col,
                    "must be an integer value for the end position of the "
                    "hit. Here, it is",
                    fields[end_col]))

        hitOrientation = fields[strand_col]
        if hitOrientation == "+":
            if hitStart > hitEnd:
                raise CmscanFormatError('%s %i %s %i %s' % (
                      "On the forward strand (+), start position of a hit "
                      "must always be smaller than its end position."
                      " This is not true for the hit between",
                      hitStart,
                      "and",
                      hitEnd,
                      ". It might be, that this hit is in fact on the "
                      "reverse strand. Please check strand orientation and"
                      " positions."))
        elif hitOrientation == "-":
            if hitStart < hitEnd:
                raise CmscanFormatError('%s %i %s %i %s' % (
                      "On the reverse strand (-), start position of a hit "
                      "must always be larger than its end position."
                      " This is not true for the hit between",
                      hitStart,
                      "and",
                      hitEnd,
                      ". It might be, that this hit is in fact on the "
                      "forward strand. Please check strand orientation and"
                      " positions."))
            else:
                hitStart, hitEnd = hitEnd, hitStart  # swap orientation
        else:
            raise CmscanFormatError(
                "%s '%s' %s %i. %s" %
                ("Unknown strand character",
                 hitOrientation, "in column",
                 strand_col,
                 "Valid characters are '+' for the forward strand and "
                 "'-' for the reverse strand."))

        # since Infernal wants the user to be aware of differences
        # between CMsearch and CMscan, the model and query information
        # are in different columns; store them under uniform keys.
        # Fix: 'is' on string literals compares identity, not equality
        # (implementation-dependent, SyntaxWarning on CPython 3.8+);
        # use '==' instead.
        if program == 'CMSEARCH':
            attributes['SEQUENCE_NAME'] = fields[0]
            attributes['SEQUENCE_ACCESSION'] = fields[1]
            attributes['MODEL_NAME'] = fields[2]
            attributes['MODEL_ACCESSION'] = fields[3]
        elif program == 'CMSCAN':
            attributes['SEQUENCE_NAME'] = fields[2]
            attributes['SEQUENCE_ACCESSION'] = fields[3]
            attributes['MODEL_NAME'] = fields[0]
            attributes['MODEL_ACCESSION'] = fields[1]
        else:
            raise CmscanFormatError("Argument 'program' must be either "
                                    "'CMSEARCH' or 'CMSCAN'!")

        # copy every column not already handled above, verbatim
        handled = {'SEQUENCE_START_POSITION', 'SEQUENCE_END_POSITION',
                   'SEQUENCE_NAME', 'SEQUENCE_ACCESSION',
                   'MODEL_NAME', 'MODEL_ACCESSION'}
        for idx, key in enumerate(_COLUMNS):
            if key not in handled:
                attributes[key] = fields[idx]

        # cmscan works on multiple sequences in one FASTA file. We want
        # to yield a separate object for each sequence, thus we create a
        # new one whenever the ID changes
        if (currentsequence != attributes['SEQUENCE_NAME'] and
                currentsequence is not False):
            yield annotations
            annotations = {}

        # a metadata interval is made out of a 'frozen dictionary' aka
        # Feature object, whose hashable value constitutes the key; the
        # value is a list of (start, end) interval tuples
        annotations[Feature(attributes)] = [(hitStart, hitEnd)]
        # store current sequence id for the next iteration
        currentsequence = attributes['SEQUENCE_NAME']

    yield annotations