Reference --------- .. [#] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt ''' from skbio.io import create_format, FileFormatError from skbio.sequence import Sequence, DNA, RNA, Protein from skbio.io.format._base import (_line_generator, _get_nth_sequence, _too_many_blanks) class EMBLFormatError(FileFormatError): pass embl = create_format('embl') # This list is ordered. From EMBL specification _HEADERS = [ 'ID', # identification (begins each entry; 1 per entry) 'AC', # accession number (>=1 per entry) 'PR', # project identifier (0 or 1 per entry) 'DT', # date (2 per entry) 'DE', # description (>=1 per entry) 'KW', # keyword (>=1 per entry) 'OS', # organism species (>=1 per entry) 'OC', # organism classification (>=1 per entry) 'OG', # organelle (0 or 1 per entry) 'RN', # reference number (>=1 per entry) 'RC', # reference comment (>=0 per entry) 'RP', # reference positions (>=1 per entry)
from __future__ import (absolute_import, division, print_function, unicode_literals) from future.builtins import zip, range from skbio.io import create_format, QSeqFormatError from skbio.io.format._base import _decode_qual_to_phred, _get_nth_sequence from skbio.alignment import SequenceCollection from skbio.sequence import Sequence, DNA, RNA, Protein _default_phred_offset = None _default_variant = None _will_filter = True qseq = create_format('qseq') @qseq.sniffer() def _qseq_sniffer(fh): empty = True try: for _, line in zip(range(10), fh): _record_parser(line) empty = False return not empty, {} except QSeqFormatError: return False, {} @qseq.reader(None)
""" # ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import pandas as pd from skbio.io import create_format, BLAST7FormatError from skbio.io.format._blast import _parse_blast_data blast7 = create_format('blast+7') column_converter = { 'query id': 'qseqid', 'query gi': 'qgi', 'query acc.': 'qacc', 'query acc.ver': 'qaccver', 'query length': 'qlen', 'subject id': 'sseqid', 'subject ids': 'sallseqid', 'subject gi': 'sgi', 'subject gis': 'sallgi', 'subject acc.': 'sacc', 'subject acc.ver': 'saccver', 'subject accs.': 'sallacc', 'subject length': 'slen',
# # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import (absolute_import, division, print_function, unicode_literals) import functools import pandas as pd from skbio.io import create_format blast6 = create_format('blast+6') _possible_columns = {'qseqid': str, 'qgi': float, 'qacc': str, 'qaccver': str, 'qlen': float, 'sseqid': str, 'sallseqid': str, 'sgi': float, 'sallgi': float, 'sacc': str, 'saccver': str, 'sallacc': str, 'slen': float, 'qstart': float, 'qend': float, 'sstart': float, 'send': float, 'qseq': str, 'sseq': str, 'evalue': float, 'bitscore': float, 'score': float, 'length': float, 'pident': float, 'nident': float, 'mismatch': float, 'positive': float, 'gapopen': float, 'gaps': float, 'ppos': float, 'frames': str, 'qframe': float, 'sframe': float, 'btop': float, 'staxids': str, 'sscinames': str, 'scomnames': str, 'sblastnames': str, 'sskingdoms': str, 'stitle': str, 'salltitles': str, 'sstrand': str, 'qcovs': float,
# ---------------------------------------------------------------------------- # Copyright (c) 2015--, micronota development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import re from skbio.metadata import IntervalMetadata from skbio.io import create_format from ..util import split, split_head transtermhp = create_format('transtermhp') @transtermhp.reader(None) def _generator(fh): '''Parse the annotation and add it to interval metadata. Parameters ---------- f : str the file path from prediction Yield ----- tuple of str and IntervalMetadata seq_id and interval metadata
# # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import (absolute_import, division, print_function, unicode_literals) from skbio.alignment import Alignment from skbio.sequence import Sequence from skbio.io import create_format, PhylipFormatError from skbio.util._misc import chunk_str phylip = create_format('phylip') @phylip.sniffer() def _phylip_sniffer(fh): # Strategy: # Read the header and a single sequence; verify that the sequence length # matches the header information. Do not verify that the total number of # lines matches the header information, since that would require reading # the whole file. try: header = next(_line_generator(fh)) _, seq_len = _validate_header(header) line = next(_line_generator(fh)) _validate_line(line, seq_len) except (StopIteration, PhylipFormatError):
# ---------------------------------------------------------------------------- # Copyright (c) 2015--, micronota development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from skbio.metadata import IntervalMetadata from skbio.io import create_format from ..util import split, split_head tandem_repeats_finder = create_format('tandem_repeats_finder') @tandem_repeats_finder.reader(None) def _generator(fh): '''Parse the annotation and add it to interval metadata. Parameters ---------- fp : str the file path from Tandem Repeat Finder prediction Yield ----- tuple of str and IntervalMetadata seq_id and interval metadata ''' splitter = split(split_head, is_head=lambda line: line.startswith('@'))
# Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import (absolute_import, division, print_function, unicode_literals) import pandas as pd from skbio.io import create_format from skbio.io.format._blast import _parse_blast_data, _possible_columns blast6 = create_format('blast+6') _default_columns = [ 'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore' ] @blast6.reader(pd.DataFrame, monkey_patch=False) def _blast6_to_data_frame(fh, columns=None, default_columns=False): if default_columns and columns is not None: raise ValueError("`columns` and `default_columns` cannot both be" " provided.") if not default_columns and columns is None: raise ValueError("Either `columns` or `default_columns` must be" " provided.")
import re

from skbio.io import create_format
from skbio.metadata import IntervalMetadata

from ..util import split, split_head

aragorn = create_format('aragorn')


@aragorn.reader(None)
def _generator(fh):
    '''Yield one ``(seq_id, parsed_record)`` pair per record read from `fh`.

    `fh` is cut into records with ``split(split_head)``. For every record the
    sequence ID is the first whitespace-delimited token of its first line
    with the leading character dropped, and the lines from the third one
    onwards are handed to ``_parse_record``.

    Iteration stops, without yielding, at the trailing summary record, whose
    first line looks like::

        >end 5 sequences 97 tRNA genes 1 tmRNA genes
    '''
    summary_re = re.compile(
        r'>end\s+\d+ sequences \d+ tRNA genes \d+ tmRNA genes')
    for record in split(split_head)(fh):
        header = record[0]
        if summary_re.match(header):
            # Summary line reached: it must be skipped, not parsed.
            return
        seq_id = header.split(None, 1)[0][1:]
        yield seq_id, _parse_record(record[2:])


def _parse_record(lines):
    '''Return interval metadata.'''
    imd = IntervalMetadata(None)
    for line in lines:
        bounds, md = _parse_line(line)
        imd.add(bounds, metadata=md)
# Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import (absolute_import, division, print_function, unicode_literals) from future.builtins import zip import numpy as np import pandas as pd from skbio._base import OrdinationResults from skbio.io import create_format, OrdinationFormatError ordination = create_format('ordination') @ordination.sniffer() def _ordination_sniffer(fh): # Smells an ordination file if *all* of the following lines are present # *from the beginning* of the file: # - eigvals header (minimally parsed) # - another line (contents ignored) # - a whitespace-only line # - proportion explained header (minimally parsed) try: _parse_header(fh, 'Eigvals', 1) next_line = next(fh, None) if next_line is not None:
# The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import absolute_import, division, print_function, unicode_literals from future.builtins import zip, range from skbio.io import create_format, QSeqFormatError from skbio.io.format._base import _decode_qual_to_phred, _get_nth_sequence from skbio.sequence import Sequence, DNA, RNA, Protein _default_phred_offset = None _default_variant = None _will_filter = True qseq = create_format("qseq") @qseq.sniffer() def _qseq_sniffer(fh): empty = True try: for _, line in zip(range(10), fh): _record_parser(line) empty = False return not empty, {} except QSeqFormatError: return False, {} @qseq.reader(None)
# ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import absolute_import, division, print_function, unicode_literals from skbio.alignment import Alignment from skbio.io import create_format, PhylipFormatError from skbio.util._misc import chunk_str phylip = create_format("phylip") @phylip.writer(Alignment) def _alignment_to_phylip(obj, fh): if obj.is_empty(): raise PhylipFormatError( "Alignment can only be written in PHYLIP format if there is at " "least one sequence in the alignment." ) sequence_length = obj.sequence_length() if sequence_length == 0: raise PhylipFormatError( "Alignment can only be written in PHYLIP format if there is at " "least one position in the alignment." )
|Yes |No |generator of :mod:`skbio.sequence.Sequence` objects | +------+------+---------------------------------------------------------------+ Reference --------- .. [#] https://samtools.github.io/hts-specs/SAMv1.pdf ''' from skbio.io import create_format from skbio.sequence import Sequence, DNA, RNA, Protein from skbio.io.format._base import ( _line_generator, _get_nth_sequence, _too_many_blanks) sam = create_format('sam') # Alignment headers _REQUIRED_FIELDS = [ 'QNAME', # Query template NAME. 'FLAG', # Combination of bitwise FLAGs 'RNAME', # Reference sequence NAME of the alignment 'POS', # 1-based leftmost mapping position of the first base 'MAPQ', # Mapping quality. -10log10(P_err). 'CIGAR', # CIGAR string 'RNEXT', # Reference sequence name of the primary alignment of NEXT 'PNEXT', # Position of the primary alignment of the NEXT read 'TLEN', # signed observed template length 'SEQ', # segment sequence 'QUAL', # ASCII of base quality ]
# The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import re from functools import partial from skbio.io import create_format, GenBankFormatError from skbio.io.format._base import (_get_nth_sequence, _line_generator, _too_many_blanks) from skbio.util._misc import chunk_str from skbio.sequence import Sequence, DNA, RNA, Protein from skbio.io.format._sequence_feature_vocabulary import ( _yield_section, _parse_section_default, _serialize_section_default, _parse_feature_table, _serialize_feature_table) genbank = create_format('genbank') # This list is ordered # used to read and write genbank file. _HEADERS = [ 'LOCUS', 'DEFINITION', 'ACCESSION', 'VERSION', 'DBSOURCE', 'DBLINK', 'KEYWORDS', 'SOURCE', 'REFERENCE', 'COMMENT', 'FEATURES', 'ORIGIN' ] @genbank.sniffer() def _genbank_sniffer(fh): # check the 1st real line is a valid LOCUS line if _too_many_blanks(fh, 5): return False, {} try:
# ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import numpy as np import pandas as pd from skbio._base import OrdinationResults from skbio.io import create_format, OrdinationFormatError ordination = create_format('ordination') @ordination.sniffer() def _ordination_sniffer(fh): # Smells an ordination file if *all* of the following lines are present # *from the beginning* of the file: # - eigvals header (minimally parsed) # - another line (contents ignored) # - a whitespace-only line # - proportion explained header (minimally parsed) try: _parse_header(fh, 'Eigvals', 1) next_line = next(fh, None) if next_line is not None:
.. [2] http://evolution.genetics.washington.edu/phylip/newicktree.html """ # ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from skbio.io import create_format, NewickFormatError from skbio.tree import TreeNode newick = create_format('newick') @newick.sniffer() def _newick_sniffer(fh): # Strategy: # The following conditions preclude a file from being newick: # * It is an empty file. # * There is whitespace inside of a label (handled by tokenizer) # * : is followed by anything that is an operator # * ( is not preceded immediately by , or another ( # * The parens are unbalanced when ; is found. # If 100 tokens (or fewer if EOF occurs earlier) then it is probably # newick, or at least we can't prove it isn't. operators = set(",;:()") empty = True
# ---------------------------------------------------------------------------- import re from collections import Iterable from skbio.sequence import DNA, Sequence from skbio.io import create_format, GFF3FormatError from skbio.metadata import IntervalMetadata from skbio.io.format._base import (_line_generator, _too_many_blanks, _get_nth_sequence) from skbio.io.format.fasta import _fasta_to_generator from skbio.io.format._sequence_feature_vocabulary import (_vocabulary_change, _vocabulary_skip) from skbio.io import write gff3 = create_format('gff3') @gff3.sniffer() def _gff3_sniffer(fh): # check the 1st real line is a valid ID line if _too_many_blanks(fh, 5): return False, {} try: line = next(_line_generator(fh, skip_blanks=True, strip=False)) except StopIteration: return False, {} if re.match(r'##gff-version\s+3', line): return True, {}
# ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from collections import OrderedDict from skbio.alignment import TabularMSA from skbio.sequence._grammared_sequence import GrammaredSequence from skbio.io import create_format, StockholmFormatError stockholm = create_format('stockholm') @stockholm.sniffer() def _stockholm_sniffer(fh): # Smells a Stockholm file if the following conditions are met: # - File isn't empty # - File contains correct header try: line = next(fh) except StopIteration: return False, {} if _is_header(line): return True, {}
Reference --------- .. [#] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt ''' from skbio.io import create_format, FileFormatError from skbio.sequence import Sequence, DNA, RNA, Protein from skbio.io.format._base import ( _line_generator, _get_nth_sequence, _too_many_blanks) class EMBLFormatError(FileFormatError): pass embl = create_format('embl') # This list is ordered. From EMBL specification _HEADERS = ['ID', # identification (begins each entry; 1 per entry) 'AC', # accession number (>=1 per entry) 'PR', # project identifier (0 or 1 per entry) 'DT', # date (2 per entry) 'DE', # description (>=1 per entry) 'KW', # keyword (>=1 per entry) 'OS', # organism species (>=1 per entry) 'OC', # organism classification (>=1 per entry) 'OG', # organelle (0 or 1 per entry) 'RN', # reference number (>=1 per entry) 'RC', # reference comment (>=0 per entry) 'RP', # reference positions (>=1 per entry) 'RX', # reference cross-reference (>=0 per entry)
import textwrap import numpy as np from skbio.io import create_format, FASTAFormatError, QUALFormatError from skbio.io.registry import FileSentinel from skbio.io.format._base import (_get_nth_sequence, _parse_fasta_like_header, _format_fasta_like_records, _line_generator, _too_many_blanks) from skbio.util._misc import chunk_str from skbio.alignment import TabularMSA from skbio.sequence import Sequence, DNA, RNA, Protein fasta = create_format('fasta') @fasta.sniffer() def _fasta_sniffer(fh): # Strategy: # Ignore up to 5 blank/whitespace-only lines at the beginning of the # file. Read up to 10 records. If at least one record is read (i.e. # the file isn't empty) and no errors are thrown during reading, assume # the file is in FASTA format. If a record appears to be QUAL, do *not* # identify the file as FASTA since we don't want to sniff QUAL files as # FASTA (technically they can be read as FASTA since the sequences may # not be validated but it probably isn't what the user wanted). Also, if # we add QUAL as its own file format in the future, we wouldn't want the # FASTA and QUAL sniffers to both positively identify a QUAL file. if _too_many_blanks(fh, 5):
""" # ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import pandas as pd from skbio.io import create_format from skbio.io.format._blast import _parse_blast_data, _possible_columns blast6 = create_format("blast+6") _default_columns = [ "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", ]
**Has Sniffer: Yes** Format Specification -------------------- An empty file consists of only whitespace characters. """ # ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import (absolute_import, division, print_function, unicode_literals) from skbio.io import create_format emptyfile = create_format('<emptyfile>') @emptyfile.sniffer() def _empty_file_sniffer(fh): for line in fh: if line.strip(): return False, {} return True, {}
# ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import csv import numpy as np from skbio.stats.distance import DissimilarityMatrix, DistanceMatrix from skbio.io import create_format, LSMatFormatError lsmat = create_format('lsmat') @lsmat.sniffer() def _lsmat_sniffer(fh): header = _find_header(fh) if header is not None: try: dialect = csv.Sniffer().sniff(header) delimiter = dialect.delimiter ids = _parse_header(header, delimiter) first_id, _ = next(_parse_data(fh, delimiter), (None, None)) if first_id is not None and first_id == ids[0]:
import re from collections import Iterable from skbio.sequence import DNA, Sequence from skbio.io import create_format, GFF3FormatError from skbio.metadata import IntervalMetadata from skbio.io.format._base import ( _line_generator, _too_many_blanks, _get_nth_sequence) from skbio.io.format.fasta import _fasta_to_generator from skbio.io.format._sequence_feature_vocabulary import ( _vocabulary_change, _vocabulary_skip) from skbio.io import write gff3 = create_format('gff3') @gff3.sniffer() def _gff3_sniffer(fh): # check the 1st real line is a valid ID line if _too_many_blanks(fh, 5): return False, {} try: line = next(_line_generator(fh, skip_blanks=True, strip=False)) except StopIteration: return False, {} if re.match(r'##gff-version\s+3', line): return True, {}
# Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import absolute_import, division, print_function, unicode_literals from collections import OrderedDict from skbio.alignment import TabularMSA from skbio.sequence._iupac_sequence import IUPACSequence from skbio.io import create_format, StockholmFormatError stockholm = create_format("stockholm") @stockholm.sniffer() def _stockholm_sniffer(fh): # Smells a Stockholm file if the following conditions are met: # - File isn't empty # - File contains correct header try: line = next(fh) except StopIteration: return False, {} if _is_header(line): return True, {}
# ---------------------------------------------------------------------------- # Copyright (c) 2015--, micronota development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from logging import getLogger from skbio.metadata import IntervalMetadata from skbio.io import create_format from ..util import split, SplitterID rnammer = create_format('rnammer') @rnammer.reader(None) def _generator(fh): '''Parse the annotation and add it to interval metadata. Parameters ---------- fn : str the file name from RNAmmer prediction Yield ----- tuple of str and IntervalMetadata seq_id and interval metadata
from six.moves import zip_longest import textwrap import numpy as np from skbio.io import create_format, FASTAFormatError, QUALFormatError from skbio.io.registry import FileSentinel from skbio.io.format._base import (_get_nth_sequence, _parse_fasta_like_header, _format_fasta_like_records, _line_generator, _too_many_blanks) from skbio.util._misc import chunk_str from skbio.alignment import SequenceCollection, Alignment from skbio.sequence import Sequence, DNA, RNA, Protein fasta = create_format('fasta') @fasta.sniffer() def _fasta_sniffer(fh): # Strategy: # Ignore up to 5 blank/whitespace-only lines at the beginning of the # file. Read up to 10 records. If at least one record is read (i.e. # the file isn't empty) and no errors are thrown during reading, assume # the file is in FASTA format. If a record appears to be QUAL, do *not* # identify the file as FASTA since we don't want to sniff QUAL files as # FASTA (technically they can be read as FASTA since the sequences may # not be validated but it probably isn't what the user wanted). Also, if # we add QUAL as its own file format in the future, we wouldn't want the # FASTA and QUAL sniffers to both positively identify a QUAL file. if _too_many_blanks(fh, 5):
# ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import numpy as np import pandas as pd from skbio.stats.ordination import OrdinationResults from skbio.io import create_format, OrdinationFormatError ordination = create_format("ordination") @ordination.sniffer() def _ordination_sniffer(fh): # Smells an ordination file if *all* of the following lines are present # *from the beginning* of the file: # - eigvals header (minimally parsed) # - another line (contents ignored) # - a whitespace-only line # - proportion explained header (minimally parsed) try: _parse_header(fh, "Eigvals", 1) next_line = next(fh, None) if next_line is not None:
Reference --------- .. [1] Eric P. Nawrocki and Sean R. Eddy, "Infernal 1.1: 100-fold faster RNA homology searches", Bioinformatics 2013, doi: 10.1093/bioinformatics/btt509 ''' from skbio.io import create_format, FileFormatError from skbio.metadata import IntervalMetadata, Feature from skbio.io.format._base import (_line_generator, _too_many_blanks) from skbio.io.format._base import _get_nth_sequence as _get_nth_record cmscan = create_format('cmscan') # column headers _COLUMNS = ['MODEL_NAME', 'MODEL_ACCESSION', 'SEQUENCE_NAME', 'SEQUENCE_ACCESSION', 'TYPE_OF_MODEL', 'MODEL_START_POSITION', 'MODEL_END_POSITION', 'SEQUENCE_START_POSITION', 'SEQUENCE_END_POSITION', 'STRAND', 'TRUNCATED', 'PASS', 'GC_CONTENT', 'BIAS', 'BITSCORE', 'EVALUE', 'INC', 'DESCRIPTION'] class CmscanFormatError(FileFormatError): pass def _construct(record, constructor=None, **kwargs): if constructor is None:
""" # ---------------------------------------------------------------------------- # Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from skbio.alignment import TabularMSA from skbio.io import create_format, PhylipFormatError from skbio.util._misc import chunk_str phylip = create_format('phylip') @phylip.sniffer() def _phylip_sniffer(fh): # Strategy: # Read the header and a single sequence; verify that the sequence length # matches the header information. Do not verify that the total number of # lines matches the header information, since that would require reading # the whole file. try: header = next(_line_generator(fh)) _, seq_len = _validate_header(header) line = next(_line_generator(fh)) _validate_line(line, seq_len) except (StopIteration, PhylipFormatError):
import re import numpy as np from skbio.io import create_format, FASTQFormatError from skbio.io.format._base import ( _decode_qual_to_phred, _encode_phred_to_qual, _get_nth_sequence, _parse_fasta_like_header, _format_fasta_like_records, _line_generator, _too_many_blanks) from skbio.alignment import SequenceCollection, Alignment from skbio.sequence import Sequence, DNA, RNA, Protein _whitespace_regex = re.compile(r'\s') fastq = create_format('fastq') @fastq.sniffer() def _fastq_sniffer(fh): # Strategy: # Ignore up to 5 blank/whitespace-only lines at the beginning of the # file. Read up to 10 records. If at least one record is read (i.e. the # file isn't empty) and the quality scores are in printable ASCII range, # assume the file is FASTQ. if _too_many_blanks(fh, 5): return False, {} try: not_empty = False for _ in zip(range(10), _fastq_to_generator(fh, phred_offset=33)):
# Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import (absolute_import, division, print_function, unicode_literals) import pandas as pd from skbio.io import create_format, BLAST7FormatError from skbio.io.format._blast import _parse_blast_data blast7 = create_format('blast+7') column_converter = {'query id': 'qseqid', 'query gi': 'qgi', 'query acc.': 'qacc', 'query acc.ver': 'qaccver', 'query length': 'qlen', 'subject id': 'sseqid', 'subject ids': 'sallseqid', 'subject gi': 'sgi', 'subject gis': 'sallgi', 'subject acc.': 'sacc', 'subject acc.ver': 'saccver', 'subject accs.': 'sallacc', 'subject length': 'slen', 'q. start': 'qstart', 'q. end': 'qend', 's. start': 'sstart', 's. end': 'send', 'query seq': 'qseq', 'subject seq': 'sseq', 'evalue': 'evalue', 'bit score': 'bitscore', 'score': 'score', 'alignment length': 'length', '% identity': 'pident', 'identical': 'nident', 'mismatches': 'mismatch', 'positives': 'positive', 'gap opens': 'gapopen', 'gaps': 'gaps',
import re import numpy as np from skbio.io import create_format, FASTQFormatError from skbio.io.format._base import (_decode_qual_to_phred, _encode_phred_to_qual, _get_nth_sequence, _parse_fasta_like_header, _format_fasta_like_records, _line_generator, _too_many_blanks) from skbio.alignment import SequenceCollection, Alignment from skbio.sequence import Sequence, DNA, RNA, Protein _whitespace_regex = re.compile(r'\s') fastq = create_format('fastq') @fastq.sniffer() def _fastq_sniffer(fh): # Strategy: # Ignore up to 5 blank/whitespace-only lines at the beginning of the # file. Read up to 10 records. If at least one record is read (i.e. the # file isn't empty) and the quality scores are in printable ASCII range, # assume the file is FASTQ. if _too_many_blanks(fh, 5): return False, {} try: not_empty = False for _ in zip(range(10), _fastq_to_generator(fh, phred_offset=33)):
# Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import csv import numpy as np from skbio.stats.distance import DissimilarityMatrix, DistanceMatrix from skbio.io import create_format, LSMatFormatError lsmat = create_format('lsmat') @lsmat.sniffer() def _lsmat_sniffer(fh): header = _find_header(fh) if header is not None: try: dialect = csv.Sniffer().sniff(header) delimiter = dialect.delimiter ids = _parse_header(header, delimiter) first_id, _ = next(_parse_data(fh, delimiter), (None, None)) if first_id is not None and first_id == ids[0]:
# The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import re import numpy as np import pandas as pd from functools import partial from skbio.io import create_format, GenBankFormatError from skbio.io.format._base import ( _get_nth_sequence, _line_generator, _too_many_blanks) from skbio.util._misc import chunk_str from skbio.sequence import Sequence, DNA, RNA, Protein genbank = create_format('genbank') # This list is ordered # used to read and write genbank file. _HEADERS = ['LOCUS', 'DEFINITION', 'ACCESSION', 'VERSION', 'DBSOURCE', 'DBLINK', 'KEYWORDS', 'SOURCE', 'REFERENCE', 'COMMENT', 'FEATURES', 'ORIGIN']
# Copyright (c) 2013--, scikit-bio development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- from __future__ import (absolute_import, division, print_function, unicode_literals) from skbio.io import create_format, ClustalFormatError from skbio.sequence import Sequence from skbio.alignment import Alignment clustal = create_format('clustal') def _label_line_parser(record, strict=True): """Returns dict mapping list of data to labels, plus list with field order. Field order contains labels in order encountered in file. NOTE: doesn't care if lines are out of order in different blocks. This should never happen anyway, but it's possible that this behavior should be changed to tighten up validation. """ labels = [] result = {} for line in record: split_line = line.strip().rsplit(None, 1)
+------+------+---------------------------------------------------------------+ |Yes |No |generator of :mod:`skbio.sequence.Sequence` objects | +------+------+---------------------------------------------------------------+ Reference --------- .. [#] https://samtools.github.io/hts-specs/SAMv1.pdf ''' from skbio.io import create_format from skbio.sequence import Sequence, DNA, RNA, Protein from skbio.io.format._base import (_line_generator, _get_nth_sequence, _too_many_blanks) sam = create_format('sam') # Alignment headers _REQUIRED_FIELDS = [ 'QNAME', # Query template NAME. 'FLAG', # Combination of bitwise FLAGs 'RNAME', # Reference sequence NAME of the alignment 'POS', # 1-based leftmost mapping position of the first base 'MAPQ', # Mapping quality. -10log10(P_err). 'CIGAR', # CIGAR string 'RNEXT', # Reference sequence name of the primary alignment of NEXT 'PNEXT', # Position of the primary alignment of the NEXT read 'TLEN', # signed observed template length 'SEQ', # segment sequence 'QUAL', # ASCII of base quality ]