def to_file(self, out_f, delimiter='\t'):
    """Save the dissimilarity matrix to file in delimited text format.

    The first line written is the header returned by ``self._format_ids``.
    Every following line holds one ID from ``self.ids`` followed by the
    matching row of ``self.data``, with elements separated by `delimiter`.

    Parameters
    ----------
    out_f : file-like object or filename
        File-like object to write serialized data to, or name of file. If
        it's a file-like object, it must have a ``write`` method, and it
        won't be closed. Else, it is opened and closed after writing.
    delimiter : str, optional
        Delimiter used to separate elements in output format.

    See Also
    --------
    from_file

    """
    with open_file(out_f, 'w') as out_f:
        formatted_ids = self._format_ids(delimiter)
        out_f.write(formatted_ids)
        out_f.write('\n')

        for id_, vals in zip(self.ids, self.data):
            out_f.write(id_)
            out_f.write(delimiter)
            # ``np.str`` was only an alias of the builtin ``str`` and has
            # been removed from NumPy; the builtin gives the same result.
            out_f.write(delimiter.join(np.asarray(vals, dtype=str)))
            out_f.write('\n')
def test_filehandle(self):
    """An already-open file handle is handed back as-is and left open."""
    with tempfile.TemporaryFile('r') as original:
        with open_file(original) as yielded:
            self.assertIs(original, yielded)
        # Leaving the ``open_file`` block must not close the handle
        self.assertFalse(original.closed)
def test_file_closed(self):
    """A file opened from a path is closed once the block exits."""
    # Manage the temporary file with ``with`` so its own handle is released
    # deterministically instead of lingering until garbage collection.
    with tempfile.NamedTemporaryFile('r') as f:
        filepath = f.name
        with open_file(filepath) as fh:
            pass
        self.assertTrue(fh.closed)
def looks_like_qiime_mapping_file(fp):
    """Checks if the file looks like a QIIME mapping file

    Parameters
    ----------
    fp : str or file-like object
        filepath to check if it looks like a QIIME mapping file

    Returns
    -------
    bool
        True if fp looks like a QIIME mapping file, false otherwise.

    Notes
    -----
    This is not doing a validation of the QIIME mapping file. It simply checks
    the first line in the file and it returns true if the first
    whitespace-separated column of that line is '#SampleID', since a
    sample/prep template will start with 'sample_name' or some other
    different column. An empty file, or a first line made only of whitespace,
    is reported as not being a mapping file.
    """
    with open_file(fp, mode='U') as f:
        first_line = f.readline()

    # ``split()`` returns an empty list both for an empty file and for a
    # whitespace-only first line; indexing it blindly would raise IndexError.
    columns = first_line.split()
    if not columns:
        return False

    return columns[0] == '#SampleID'
def load_mf(fn):
    """Load a mapping file into a DataFrame indexed by its 'SampleID' column.

    Parameters
    ----------
    fn : str or file-like object
        Whatever ``open_file`` accepts; it is handed to
        ``parse_mapping_file`` once opened.

    Returns
    -------
    pd.DataFrame
        Frame built from the parsed rows, using the parsed header as column
        names and 'SampleID' as the index.
    """
    from skbio.io.util import open_file
    from emperor.qiime_backports.parse import parse_mapping_file

    with open_file(fn) as handle:
        # the third element returned by the parser is not needed here
        rows, column_names, _ = parse_mapping_file(handle)

    frame = pd.DataFrame(rows, columns=column_names)
    return frame.set_index('SampleID')
def test_filehandle(self):
    """A list of open handles comes back unchanged and none gets closed."""
    with tempfile.TemporaryFile('r') as first, \
            tempfile.TemporaryFile('r') as second:
        with open_file([first, second]) as handles:
            self.assertIs(first, handles[0])
            self.assertIs(second, handles[1])
        # Leaving the ``open_file`` block must not close any of the handles
        for handle in handles:
            self.assertFalse(handle.closed)
def test_file_closed_harder(self):
    """File gets closed by ``open_file`` even if the block raises."""
    # Manage the temporary file with ``with`` so its own handle is released
    # deterministically instead of lingering until garbage collection.
    with tempfile.NamedTemporaryFile('r') as f:
        filepath = f.name
        try:
            with open_file(filepath) as fh:
                raise TypeError
        except TypeError:
            self.assertTrue(fh.closed)
        else:
            # If we're here, no exceptions have been raised inside the
            # try clause, so the context manager swallowed them. No
            # good.
            raise Exception("`open_file` didn't propagate exceptions")
def parse_items(fp):
    """Read a file that stores one item per line

    Parameters
    ----------
    fp : str/bytes/unicode string or file-like
        Filepath or file-like object to parse.

    Returns
    -------
    list
        The items found in the file, in file order. Newlines at the very
        start and end of the contents are dropped first, so a file holding
        nothing but newlines yields an empty list.
    """
    with open_file(fp, 'U') as handle:
        contents = handle.read()

    trimmed = contents.strip('\n')
    if not trimmed:
        return []
    return trimmed.split('\n')
def parser(lines):
    """Yield records, each a list of lines beginning at a label line.

    ``constructor``, ``ignore`` and ``is_label_line`` are not defined in
    this function; presumably they come from an enclosing factory -- verify
    against the surrounding code.

    Parameters
    ----------
    lines : anything accepted by ``open_file``
        Source of the lines to group.

    Yields
    ------
    list
        The (optionally constructed) lines of one record.
    """
    with open_file(lines) as lines:
        record = []
        for raw in lines:
            try:
                raw = str(raw.decode("utf-8"))
            except AttributeError:
                # nothing to decode: the object has no ``decode`` method
                pass

            line = raw if constructor is None else constructor(raw)
            if ignore(line):
                continue

            # a label line closes the record gathered so far (if any)
            if is_label_line(line) and record:
                yield record
                record = []
            record.append(line)

        # the last record has no following label line to flush it
        if record:
            yield record
def parser(lines):
    """Group incoming lines into records delimited by label lines.

    ``constructor``, ``ignore`` and ``is_label_line`` are free variables
    here; presumably they are supplied by an enclosing factory -- verify
    against the surrounding code.

    Parameters
    ----------
    lines : anything accepted by ``open_file``
        Source of the lines to group.

    Yields
    ------
    list
        The (optionally constructed) lines that make up one record.
    """
    with open_file(lines) as lines:
        pending = []
        for item in lines:
            try:
                item = str(item.decode('utf-8'))
            except AttributeError:
                # nothing to decode: the object has no ``decode`` method
                pass

            if constructor is None:
                line = item
            else:
                line = constructor(item)

            if ignore(line):
                continue

            # reaching a label means the previous record is complete
            if is_label_line(line) and pending:
                yield pending
                pending = []
            pending.append(line)

        # emit whatever is left once the input is exhausted
        if pending:
            yield pending
def load_template_to_dataframe(fn, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    The file is read line by line, every cell is stripped of surrounding
    whitespace, and the cleaned text is parsed with ``pd.read_csv`` as a
    tab-separated table whose values are all strings.

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information.
        Passing '#SampleID' makes the file go through the QIIME mapping file
        parser first, after which 'SampleID' is used as the index.

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the `index` column is not present in the template.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index of the DataFrame is set to the `index` column, whose values are
    cast to a string and stripped.
    Additionally rows that start with a tab character will be ignored and
    columns that are empty will be removed. Empty sample names will be
    removed from the DataFrame.

    Controlled column names (and 'sample_name') are matched
    case-insensitively and lowercased.

    Everything in the DataFrame will be read and managed as string
    """
    # Load in file lines
    holdfile = None
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        # rebuild the lines from the parsed output: header first, then rows
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # Strip all values in the cells in the input file
    for pos, line in enumerate(holdfile):
        cols = line.split('\t')
        # the header of a (non-QIIME) template gets special treatment
        if pos == 0 and index != 'SampleID':
            # get and clean the controlled columns
            ccols = {'sample_name'}
            ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
            # controlled columns are lowercased; any other header only
            # has its surrounding whitespace removed
            newcols = [
                c.lower().strip() if c.lower().strip() in ccols
                else c.strip()
                for c in cols]

            # while we are here, let's check for duplicate columns headers
            if len(set(newcols)) != len(newcols):
                raise qdb.exceptions.QiitaDBDuplicateHeaderError(
                    find_duplicates(newcols))
        else:
            # .strip will remove odd chars, newlines, tabs and multiple
            # spaces but we need to read a new line at the end of the
            # line(+'\n')
            newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]

        holdfile[pos] = '\t'.join(newcols) + '\n'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters i. e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            dtype=str,
            encoding='utf-8',
            infer_datetime_format=False,
            keep_default_na=False,
            index_col=False,
            comment='\t',
            converters={index: lambda x: str(x).strip()})
        # remove newlines and tabs from fields
        template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
                         regex=True, inplace=True)
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        # maps column header -> list of (1-based) offending row numbers
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    errors[headers[col]].append(row)

        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
                 for header, rows in viewitems(errors)]
        raise qdb.exceptions.QiitaDBError(
            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))

    # remember the columns as parsed so dropped ones can be reported later
    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns so let's
    # find the columns that are all ''
    columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0))
    template.drop(template.columns[columns], axis=1, inplace=True)

    # the index is no longer a column, so it must not be reported as dropped
    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s'
            % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
def parse_qseq(infile, phred_offset=33):
    r"""Iterate over the records stored in a qseq file.

    Parameters
    ----------
    infile : open file object or str
        An open qseq file or a path to a qseq file.
    phred_offset : {33, 64}, optional
        Phred offset applied when turning quality symbols into integers.

    Yields
    ------
    four-item tuple: (str, str, np.array(dtype=int), namedtuple)
        The sequence id, the sequence, the quality scores and a namedtuple
        with the remaining record information. The sequence ID format is:
        <Machine name>_<Run number>:<Lane number>:<Tile number>:<x>:<y>#
        <Index>/<Read number>. The namedtuple attributes are: machine_name,
        run, lane, tile, x, y, index, read and filtered.

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    QseqParseError
        If a line does not split into exactly eleven whitespace-separated
        fields.

    Notes
    -----
    This is a generator, so the errors above surface while iterating, not
    when the function is called.

    Examples
    --------
    Assume we have a qseq-formatted file with the following contents::

        CRESSIA 242 1 2204 1453 1918 0 1
            .TTAATAAGAATGTCTGTTGTGGCTTAAAA B[[[W][Y[Zccccccccc\cccac_____ 1
        CRESSIA 242 1 2204 1490 1921 0 2
            ..GTAAAACCCATATATTGAAAACTACAAA BWUTWcXVXXcccc_cccccccccc_cccc 1

    >>> from six import StringIO
    >>> qseq_f = StringIO('CRESSIA\t242\t1\t2204\t1453\t1918\t0\t1\t'
    ...     '.TTAATAAGAATGTCTGTTGTGGCTTAAAA\tB[[[W][Y[Zccccccccc\cccac_____\t1\n'
    ...     'CRESSIA\t242\t1\t2204\t1490\t1921\t0\t2\t'
    ...     '..GTAAAACCCATATATTGAAAACTACAAA\tBWUTWcXVXXcccc_cccccccccc_cccc\t1\n'
    ... )

    We can parse this as follows:

    >>> from skbio import parse_qseq
    >>> for seq_id, seq, qual, record in parse_qseq(qseq_f, phred_offset=64):
    ...     print(seq_id)
    ...     print(seq)
    ...     print(qual[:10])
    ...     print(record.run)
    ...     print(record.lane)
    CRESSIA_242:1:2204:1453:1918#0/1
    .TTAATAAGAATGTCTGTTGTGGCTTAAAA
    [ 2 27 27 27 23 29 27 25 27 26]
    242
    1
    CRESSIA_242:1:2204:1490:1921#0/2
    ..GTAAAACCCATATATTGAAAACTACAAA
    [ 2 23 21 20 23 35 24 22 24 24]
    242
    1

    """
    # pick the symbol -> score conversion matching the requested offset
    if phred_offset == 33:
        to_phred = ascii_to_phred33
    elif phred_offset == 64:
        to_phred = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    # container for everything that is neither id, sequence nor quality
    field_names = ['machine_name', 'run', 'lane', 'tile', 'x', 'y',
                   'index', 'read', 'filtered']
    Record = collections.namedtuple('Record', field_names)

    with open_file(infile) as handle:
        for entry in handle:
            try:
                entry = str(entry.decode('utf-8'))
            except AttributeError:
                # nothing to decode: the object has no ``decode`` method
                pass

            # a record must hold exactly eleven fields
            try:
                (machine, run, lane, tile, x, y, index, read,
                 seq, symbols, passed) = entry.split()
            except ValueError:
                raise QseqParseError("Invalid QSEQ record found.")

            # the first eight fields make up the sequence identifier
            seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % (
                machine, run, lane, tile, x, y, index, read)

            scores = to_phred(symbols)

            numeric = [int(value)
                       for value in (run, lane, tile, x, y, index, read)]
            record = Record(machine, *numeric,
                            filtered=bool(int(passed)))

            yield seq_id, seq, scores, record
def parse_qseq(infile, phred_offset=33):
    r"""Iterate over the records stored in a qseq file.

    Parameters
    ----------
    infile : open file object or str
        An open qseq file or a path to a qseq file.
    phred_offset : {33, 64}, optional
        Phred offset applied when turning quality symbols into integers.

    Yields
    ------
    four-item tuple: (str, str, np.array(dtype=int), namedtuple)
        The sequence id, the sequence, the quality scores and a namedtuple
        with the remaining record information. The sequence ID format is:
        <Machine name>_<Run number>:<Lane number>:<Tile number>:<x>:<y>#
        <Index>/<Read number>. The namedtuple attributes are: machine_name,
        run, lane, tile, x, y, index, read and filtered.

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    QseqParseError
        If a line does not split into exactly eleven whitespace-separated
        fields.

    Notes
    -----
    This is a generator, so the errors above surface while iterating, not
    when the function is called.

    Examples
    --------
    Assume we have a qseq-formatted file with the following contents::

        CRESSIA 242 1 2204 1453 1918 0 1
            .TTAATAAGAATGTCTGTTGTGGCTTAAAA B[[[W][Y[Zccccccccc\cccac_____ 1
        CRESSIA 242 1 2204 1490 1921 0 2
            ..GTAAAACCCATATATTGAAAACTACAAA BWUTWcXVXXcccc_cccccccccc_cccc 1

    >>> from future.utils.six import StringIO
    >>> qseq_f = StringIO('CRESSIA\t242\t1\t2204\t1453\t1918\t0\t1\t'
    ...     '.TTAATAAGAATGTCTGTTGTGGCTTAAAA\tB[[[W][Y[Zccccccccc\cccac_____\t1\n'
    ...     'CRESSIA\t242\t1\t2204\t1490\t1921\t0\t2\t'
    ...     '..GTAAAACCCATATATTGAAAACTACAAA\tBWUTWcXVXXcccc_cccccccccc_cccc\t1\n'
    ... )

    We can parse this as follows:

    >>> from skbio import parse_qseq
    >>> for seq_id, seq, qual, record in parse_qseq(qseq_f, phred_offset=64):
    ...     print(seq_id)
    ...     print(seq)
    ...     print(qual[:10])
    ...     print(record.run)
    ...     print(record.lane)
    CRESSIA_242:1:2204:1453:1918#0/1
    .TTAATAAGAATGTCTGTTGTGGCTTAAAA
    [ 2 27 27 27 23 29 27 25 27 26]
    242
    1
    CRESSIA_242:1:2204:1490:1921#0/2
    ..GTAAAACCCATATATTGAAAACTACAAA
    [ 2 23 21 20 23 35 24 22 24 24]
    242
    1

    """
    # pick the symbol -> score conversion matching the requested offset
    if phred_offset == 33:
        to_phred = ascii_to_phred33
    elif phred_offset == 64:
        to_phred = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    # container for everything that is neither id, sequence nor quality
    field_names = ['machine_name', 'run', 'lane', 'tile', 'x', 'y',
                   'index', 'read', 'filtered']
    Record = collections.namedtuple('Record', field_names)

    with open_file(infile) as handle:
        for entry in handle:
            try:
                entry = str(entry.decode('utf-8'))
            except AttributeError:
                # nothing to decode: the object has no ``decode`` method
                pass

            # a record must hold exactly eleven fields
            try:
                (machine, run, lane, tile, x, y, index, read,
                 seq, symbols, passed) = entry.split()
            except ValueError:
                raise QseqParseError("Invalid QSEQ record found.")

            # the first eight fields make up the sequence identifier
            seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % (
                machine, run, lane, tile, x, y, index, read)

            scores = to_phred(symbols)

            numeric = [int(value)
                       for value in (run, lane, tile, x, y, index, read)]
            record = Record(machine, *numeric,
                            filtered=bool(int(passed)))

            yield seq_id, seq, scores, record
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    The file is read line by line, optionally stripped of whitespace around
    each cell, and parsed with ``pd.read_csv`` as a tab-separated table whose
    values are all strings.

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information.
        Passing '#SampleID' makes the file go through the QIIME mapping file
        parser first, after which 'SampleID' is used as the index.

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the `index` column is not present in the template.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index of the DataFrame is set to the `index` column, whose values are
    cast to a string and stripped.
    Additionally rows that start with a tab character will be ignored and
    columns that are empty will be removed. Empty sample names will be
    removed from the DataFrame.

    Controlled column names (and 'sample_name') are matched
    case-insensitively and lowercased. Duplicate headers are detected
    case-insensitively.

    Everything in the DataFrame will be read and managed as string
    """
    # Load in file lines
    holdfile = None
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            # '\n' is not in the stripped set, so line endings are kept
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
    # only controlled headers are lowercased; the others are left untouched
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        # rebuild the lines from the parsed output: header first, then rows
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # Check that we don't have duplicate columns
    col_names = [c.lower() for c in holdfile[0].strip().split('\t')]
    if len(set(col_names)) != len(col_names):
        raise qdb.exceptions.QiitaDBDuplicateHeaderError(
            find_duplicates(col_names))

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters i. e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            dtype=str,
            encoding='utf-8',
            infer_datetime_format=False,
            keep_default_na=False,
            index_col=False,
            comment='\t',
            converters={index: lambda x: str(x).strip()})
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        # maps column header -> list of (1-based) offending row numbers
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    errors[headers[col]].append(row)

        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
                 for header, rows in viewitems(errors)]
        raise qdb.exceptions.QiitaDBError(
            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))

    # remember the columns as parsed so dropped ones can be reported later
    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns so let's
    # find the columns that are all ''
    columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0))
    template.drop(template.columns[columns], axis=1, inplace=True)

    # the index is no longer a column, so it must not be reported as dropped
    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s'
            % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    The file is read line by line, optionally stripped of whitespace around
    each cell, and parsed with ``pd.read_csv`` as a tab-separated table.
    A fixed set of columns is forced to ``str``; pandas infers the type of
    the remaining ones and the inferred types of the reserved numeric and
    boolean columns are then validated.

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information.
        Passing '#SampleID' makes the file go through the QIIME mapping file
        parser first, after which 'SampleID' is used as the index.

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the `index` column is not present in the template.
        If there's a value in one of the reserved columns that cannot be cast
        to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index of the DataFrame is set to the `index` column, whose values are
    cast to a string and stripped.
    Additionally rows that start with a tab character will be ignored and
    columns that are empty will be removed. Empty sample names will be
    removed from the DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be lowercased
    on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |           sample_name |          str |
    +-----------------------+--------------+
    |             #SampleID |          str |
    +-----------------------+--------------+
    |     physical_location |          str |
    +-----------------------+--------------+
    | has_physical_specimen |         bool |
    +-----------------------+--------------+
    |    has_extracted_data |         bool |
    +-----------------------+--------------+
    |           sample_type |          str |
    +-----------------------+--------------+
    |       host_subject_id |          str |
    +-----------------------+--------------+
    |           description |          str |
    +-----------------------+--------------+
    |              latitude |        float |
    +-----------------------+--------------+
    |             longitude |        float |
    +-----------------------+--------------+
    """
    # Load in file lines
    holdfile = None
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            # '\n' is not in the stripped set, so line endings are kept
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
    # only controlled headers are lowercased; the others are left untouched
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        # rebuild the lines from the parsed output: header first, then rows
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # na_values:
    #   the values that should be considered as empty
    # true_values:
    #   the values that should be considered "True" for boolean columns
    # false_values:
    #   the values that should be considered "False" for boolean columns
    # converters:
    #   ensure that sample names are not converted into any other types but
    #   strings and remove any trailing spaces. Don't let pandas try to guess
    #   the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters i. e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            encoding='utf-8',
            infer_datetime_format=True,
            keep_default_na=False,
            na_values=qdb.metadata_template.constants.NA_VALUES,
            true_values=qdb.metadata_template.constants.TRUE_VALUES,
            false_values=qdb.metadata_template.constants.FALSE_VALUES,
            parse_dates=True,
            index_col=False,
            comment='\t',
            mangle_dupe_cols=False,
            converters={
                index: lambda x: str(x).strip(),
                # required sample template information
                'physical_location': str,
                'sample_type': str,
                # collection_timestamp is not added here
                'host_subject_id': str,
                'description': str,
                # common prep template information
                'center_name': str,
                # this key used to be misspelled ('center_projct_name'), so
                # the converter never matched the real column
                'center_project_name': str})
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        # maps column header -> list of (1-based) offending row numbers
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    errors[headers[col]].append(row)

        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
                 for header, rows in viewitems(errors)]
        raise qdb.exceptions.QiitaDBError(
            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))

    # Check that we don't have duplicate columns
    if len(set(template.columns)) != len(template.columns):
        raise qdb.exceptions.QiitaDBDuplicateHeaderError(
            find_duplicates(template.columns))

    # let pandas infer the dtypes of these columns, if the inference is
    # not correct, then we have to raise an error
    #
    # ``np.int`` / ``np.float`` were plain aliases of the builtins and have
    # been removed from NumPy. Both the builtin and the NumPy scalar types
    # are accepted because iterating a Series may yield either, depending on
    # the pandas version.
    columns_to_dtype = [
        (['latitude', 'longitude'],
         (int, float, np.integer, np.floating),
         'integer or decimal'),
        (['has_physical_specimen', 'has_extracted_data'],
         (bool, np.bool_),
         'boolean')]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all(
                    isinstance(val, c_dtype) for val in template[n]):
                raise qdb.exceptions.QiitaDBColumnError(
                    "The '%s' column includes values that cannot be cast "
                    "into a %s value " % (n, english_desc))

    # remember the columns as parsed so dropped ones can be reported later
    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    # the index is no longer a column, so it must not be reported as dropped
    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s'
            % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
def from_file(cls, ord_res_f):
    r"""Build an ordination results object from its text serialization.

    The sections are read in a fixed order (eigvals, proportion explained,
    species, site, biplot, site constraints) and an empty line is expected
    between consecutive sections.

    The ord_res_f format should look like::

        Eigvals<tab>2
        0.096<tab>0.040

        Proportion explained<tab>2
        0.512<tab>0.488

        Species<tab>3<tab>2
        Species1<tab>0.408<tab>0.069
        Species2<tab>-0.115<tab>-0.299
        Species3<tab>-0.309<tab>0.187

        Site<tab>3<tab>2
        Site1<tab>-0.848<tab>0.882
        Site2<tab>-0.220<tab>-1.344
        Site3<tab>1.666<tab>0.470

        Biplot<tab>4<tab>3
        0.422<tab>-0.559<tab>-0.713
        0.988<tab>0.150<tab>-0.011
        -0.556<tab>0.817<tab>0.147
        -0.404<tab>-0.905<tab>-0.127

        Site constraints<tab>3<tab>2
        Site1<tab>-0.848<tab>0.882
        Site2<tab>-0.220<tab>-1.344
        Site3<tab>1.666<tab>0.470

    If a given result attribute is not present (e.g. Biplot), it should be
    still defined and declare its dimensions as 0::

        Biplot<tab>0<tab>0

    Parameters
    ----------
    ord_res_f : iterable of str or str
        Iterable of strings (e.g., open file handle, file-like object, list
        of strings, etc.) or a file path (a string) containing the
        serialized ordination results.

    Returns
    -------
    OrdinationResults
        Instance of type `cls` containing the parsed contents of
        `ord_res_f`.

    Raises
    ------
    ValueError
        if the shapes of the different sections of the file are not
        consistent
    FileFormatError
        if the format of the file is not recognized (not raised here
        directly; presumably by the section parsing helpers)

    Examples
    --------
    Assume we have the following tab-delimited text file storing the
    ordination results::

        Eigvals\t2
        0.0961330159181\t0.0409418140138

        Proportion explained\t0

        Species\t3\t2
        Species1\t0.408869425742\t0.0695518116298
        Species2\t-0.1153860437\t-0.299767683538
        Species3\t-0.309967102571\t0.187391917117

        Site\t3\t2
        Site1\t-0.848956053187\t0.882764759014
        Site2\t-0.220458650578\t-1.34482000302
        Site3\t1.66697179591\t0.470324389808

        Biplot\t0\t0

        Site constraints\t0\t0

    Load the ordination results from the file:

    >>> from StringIO import StringIO
    >>> from skbio.stats.ordination import OrdinationResults
    >>> or_f = StringIO("Eigvals\t2\n"
    ...                 "0.0961330159181\t0.0409418140138\n"
    ...                 "\n"
    ...                 "Proportion explained\t0\n"
    ...                 "\n"
    ...                 "Species\t3\t2\n"
    ...                 "Species1\t0.408869425742\t0.0695518116298\n"
    ...                 "Species2\t-0.1153860437\t-0.299767683538\n"
    ...                 "Species3\t-0.309967102571\t0.187391917117\n"
    ...                 "\n"
    ...                 "Site\t3\t2\n"
    ...                 "Site1\t-0.848956053187\t0.882764759014\n"
    ...                 "Site2\t-0.220458650578\t-1.34482000302\n"
    ...                 "Site3\t1.66697179591\t0.470324389808\n"
    ...                 "\n"
    ...                 "Biplot\t0\t0\n"
    ...                 "\n"
    ...                 "Site constraints\t0\t0\n")
    >>> ord_res = OrdinationResults.from_file(or_f)

    """
    with open_file(ord_res_f, 'U') as fd:
        line_iter = iter(fd)

        # section 1: eigenvalues, found right at the start of the file
        eigvals = cls._parse_eigvals(line_iter)
        cls._check_empty_line(line_iter)

        # section 2: proportion explained, one value per eigenvalue
        prop_expl = cls._parse_proportion_explained(line_iter)
        if prop_expl is not None and len(prop_expl) != len(eigvals):
            raise ValueError(
                'There should be as many proportion explained'
                ' values as eigvals: %d != %d' %
                (len(prop_expl), len(eigvals)))
        cls._check_empty_line(line_iter)

        # section 3: species coordinates, one per eigenvalue
        species, species_ids = cls._parse_coords(line_iter, 'Species')
        if species is not None and len(species[0]) != len(eigvals):
            raise ValueError(
                'There should be as many coordinates per'
                ' species as eigvals: %d != %d' %
                (len(species[0]), len(eigvals)))
        cls._check_empty_line(line_iter)

        # section 4: site coordinates, one per eigenvalue
        site, site_ids = cls._parse_coords(line_iter, 'Site')
        if site is not None and len(site[0]) != len(eigvals):
            raise ValueError(
                'There should be as many coordinates per'
                ' site as eigvals: %d != %d' %
                (len(site[0]), len(eigvals)))
        cls._check_empty_line(line_iter)

        # section 5: biplot
        biplot = cls._parse_biplot(line_iter)
        cls._check_empty_line(line_iter)

        # section 6: site constraints, whose ids must match the site ids
        cons, cons_ids = cls._parse_coords(line_iter, 'Site constraints')
        both_present = cons_ids is not None and site_ids is not None
        if both_present and cons_ids != site_ids:
            raise ValueError(
                'Site constraints ids and site ids must be'
                ' equal: %s != %s' % (cons_ids, site_ids))

    return cls(eigvals=eigvals,
               species=species,
               site=site,
               biplot=biplot,
               site_constraints=cons,
               proportion_explained=prop_expl,
               species_ids=species_ids,
               site_ids=site_ids)
def test_BytesIO(self):
    """An in-memory ``BytesIO`` buffer is handed back unchanged."""
    buffer = BytesIO(b"File contents")
    with open_file(buffer) as handle:
        self.assertIs(handle, buffer)
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file
    index : str, optional
        Defaults to 'sample_name'. The column to use as index of the loaded
        information. Passing '#SampleID' makes the file be parsed as a QIIME
        mapping file first; the resulting index column is then 'SampleID'.

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the index column is not present in the template.
        If there's a value in one of the reserved columns that cannot be cast
        to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index column is read as a string (with surrounding whitespace removed)
    and becomes the index of the DataFrame. Additionally rows that start with
    a tab character will be ignored and columns that are empty will be
    removed. Rows with an empty index value will be removed from the DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be lowercased
    on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |           sample_name |          str |
    +-----------------------+--------------+
    |             #SampleID |          str |
    +-----------------------+--------------+
    |     physical_location |          str |
    +-----------------------+--------------+
    | has_physical_specimen |         bool |
    +-----------------------+--------------+
    |    has_extracted_data |         bool |
    +-----------------------+--------------+
    |           sample_type |          str |
    +-----------------------+--------------+
    |       host_subject_id |          str |
    +-----------------------+--------------+
    |           description |          str |
    +-----------------------+--------------+
    |              latitude |        float |
    +-----------------------+--------------+
    |             longitude |        float |
    +-----------------------+--------------+
    """
    # Load in file lines
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(CONTROLLED_COLS)
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, _ = _parse_mapping_file(holdfile)
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # na_values:
    #   the values that should be considered as empty
    # true_values:
    #   the values that should be considered "True" for boolean columns
    # false_values:
    #   the values that should be considered "False" for boolean columns
    # converters:
    #   ensure that sample names are not converted into any other types but
    #   strings and remove any trailing spaces. Don't let pandas try to guess
    #   the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters i. e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            encoding='utf-8',
            infer_datetime_format=True,
            keep_default_na=False,
            na_values=NA_VALUES,
            true_values=TRUE_VALUES,
            false_values=FALSE_VALUES,
            parse_dates=True,
            index_col=False,
            comment='\t',
            mangle_dupe_cols=False,
            converters={
                index: lambda x: str(x).strip(),
                # required sample template information
                'physical_location': str,
                'sample_type': str,
                # collection_timestamp is not added here
                'host_subject_id': str,
                'description': str,
                # common prep template information
                'center_name': str,
                # NOTE: this key used to be misspelled 'center_projct_name',
                # so the converter never matched the real column
                'center_project_name': str})
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    # A ragged row can hold more cells than the header has
                    # names; fall back to a positional label instead of
                    # letting an IndexError hide the real problem.
                    if col < len(headers):
                        col_name = headers[col]
                    else:
                        col_name = 'column %d' % (col + 1)
                    errors[col_name].append(row)
        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
                 for header, rows in viewitems(errors)]
        raise QiitaDBError('Non UTF-8 characters found in columns:\n' +
                           '\n'.join(lines))

    # Check that we don't have duplicate columns
    if len(set(template.columns)) != len(template.columns):
        raise QiitaDBDuplicateHeaderError(find_duplicates(template.columns))

    # let pandas infer the dtypes of these columns, if the inference is
    # not correct, then we have to raise an error.
    # The builtins int/float are used instead of np.int/np.float: those were
    # plain aliases of the builtins and no longer exist in NumPy >= 1.24.
    columns_to_dtype = [(['latitude', 'longitude'],
                         (int, float),
                         'integer or decimal'),
                        (['has_physical_specimen', 'has_extracted_data'],
                         np.bool_,
                         'boolean')]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all(
                    isinstance(val, c_dtype) for val in template[n]):
                raise QiitaDBColumnError("The '%s' column includes values "
                                         "that cannot be cast into a %s "
                                         "value " % (n, english_desc))

    initial_columns = set(template.columns)

    if index not in template.columns:
        raise QiitaDBColumnError("The '%s' column is missing from "
                                 "your template, this file cannot be parsed."
                                 % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn('The following column(s) were removed from the template '
                      'because all their values are empty: '
                      '%s' % ', '.join(dropped_cols), QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
def to_file(self, out_f):
    """Save the ordination results to file in text format.

    The sections are written in a fixed order: eigvals, proportion
    explained, species, site, biplot and site constraints. A section whose
    attribute is ``None`` is written as a header with zero dimensions.

    Parameters
    ----------
    out_f : file-like object or filename
        File-like object to write serialized data to, or name of
        file. If it's a file-like object, it must have a ``write``
        method, and it won't be closed. Else, it is opened and
        closed after writing.

    See Also
    --------
    from_file

    """
    def tabbed(values):
        # NumPy does the number -> text conversion. The builtin ``str`` is
        # what the ``np.str`` alias (removed in NumPy 1.24) pointed to, so
        # the produced text is unchanged.
        return '\t'.join(np.asarray(values, dtype=str))

    with open_file(out_f, 'w') as fh:
        # Write eigvals. The ``shape`` tuple supplies the format arguments
        # (assumes eigvals is one-dimensional -- TODO confirm).
        fh.write("Eigvals\t%d\n" % self.eigvals.shape)
        fh.write("%s\n\n" % tabbed(self.eigvals))

        # Write proportion explained
        if self.proportion_explained is None:
            fh.write("Proportion explained\t0\n\n")
        else:
            fh.write("Proportion explained\t%d\n" %
                     self.proportion_explained.shape)
            fh.write("%s\n\n" % tabbed(self.proportion_explained))

        # Write species
        if self.species is None:
            fh.write("Species\t0\t0\n\n")
        else:
            fh.write("Species\t%d\t%d\n" % self.species.shape)
            for id_, vals in zip(self.species_ids, self.species):
                fh.write("%s\t%s\n" % (id_, tabbed(vals)))
            fh.write("\n")

        # Write site
        if self.site is None:
            fh.write("Site\t0\t0\n\n")
        else:
            fh.write("Site\t%d\t%d\n" % self.site.shape)
            for id_, vals in zip(self.site_ids, self.site):
                fh.write("%s\t%s\n" % (id_, tabbed(vals)))
            fh.write("\n")

        # Write biplot (rows carry no id column)
        if self.biplot is None:
            fh.write("Biplot\t0\t0\n\n")
        else:
            fh.write("Biplot\t%d\t%d\n" % self.biplot.shape)
            for vals in self.biplot:
                fh.write("%s\n" % tabbed(vals))
            fh.write("\n")

        # Write site-constraints; rows are labelled with the site ids
        if self.site_constraints is None:
            fh.write("Site constraints\t0\t0\n")
        else:
            fh.write("Site constraints\t%d\t%d\n" %
                     self.site_constraints.shape)
            for id_, vals in zip(self.site_ids, self.site_constraints):
                fh.write("%s\t%s\n" % (id_, tabbed(vals)))
def from_file(cls, dm_f, delimiter='\t'):
    """Build a dissimilarity matrix from delimited text.

    `dm_f` is either a file path or an iterable of lines. The first
    meaningful line is the header listing the IDs; every following
    non-blank line holds an ID and then one dissimilarity (float) per
    header ID, in header order. A 2x2 matrix with IDs ``'a'`` and ``'b'``
    could be serialized as::

        <del>a<del>b
        a<del>0.0<del>1.0
        b<del>1.0<del>0.0

    with ``<del>`` standing for the delimiter.

    Parameters
    ----------
    dm_f : iterable of str or str
        Lines of a serialized dissimilarity matrix (open file handle,
        file-like object, list of strings, ...) or the path of a file
        containing them.
    delimiter : str, optional
        String that separates the elements of each line.

    Returns
    -------
    DissimilarityMatrix
        Instance of `cls` created from the parsed data and IDs.

    Raises
    ------
    DissimilarityMatrixFormatError
        If a data row appears after the matrix is already full, a row
        does not hold exactly one value per header ID, a row label does
        not match the header ID at the same position, or fewer rows than
        header IDs are found.

    Notes
    -----
    Whitespace-only lines between data rows are skipped, and row labels
    have leading/trailing whitespace removed. Header handling (including
    comment lines before it) is delegated to ``cls._parse_ids``.

    .. note::
        File-like objects passed to this method will not be closed upon
        the completion of the parsing, it is responsibility of the owner
        of the object to perform this operation.

    """
    # np.loadtxt is deliberately avoided: it needs far too much memory
    # (e.g. a 2GB matrix eats up 10GB that is not released afterwards). See:
    # http://mail.scipy.org/pipermail/numpy-tickets/2012-August/006749.html
    with open_file(dm_f, 'U') as handle:
        # A single iterator is shared by the header parser and the row loop
        # so the row loop resumes right after the header, even when the
        # input is a re-iterable object such as a list of strings.
        lines = iter(handle)

        ids = cls._parse_ids(lines, delimiter)
        num_ids = len(ids)
        data = np.empty((num_ids, num_ids), dtype=np.float64)

        # Counts real data rows only; enumerate() would also count the
        # blank lines that are skipped below.
        rows_read = 0
        for raw_line in lines:
            stripped = raw_line.strip()
            if not stripped:
                continue

            if rows_read >= num_ids:
                # The matrix is already full, extra data must not be
                # silently dropped.
                raise DissimilarityMatrixFormatError(
                    "Encountered extra rows without corresponding IDs in"
                    " the header.")

            tokens = stripped.split(delimiter)
            # The first token is the row label, the rest are the values.
            num_values = len(tokens) - 1
            if num_values != num_ids:
                raise DissimilarityMatrixFormatError(
                    "There are %d values in row number %d, which is not"
                    " equal to the number of IDs in the header (%d)."
                    % (num_values, rows_read + 1, num_ids))

            curr_id = tokens[0].strip()
            expected_id = ids[rows_read]
            if curr_id != expected_id:
                raise DissimilarityMatrixFormatError(
                    "Encountered mismatched IDs while parsing the "
                    "dissimilarity matrix file. Found '%s' but expected "
                    "'%s'. Please ensure that the IDs match between the "
                    "dissimilarity matrix header (first row) and the row "
                    "labels (first column)." % (curr_id, expected_id))

            data[rows_read, :] = np.asarray(tokens[1:], dtype=float)
            rows_read += 1

        if rows_read != num_ids:
            raise DissimilarityMatrixFormatError(
                "Expected %d row(s) of data, but found %d." %
                (num_ids, rows_read))

    return cls(data, ids)
def load_template_to_dataframe(fn, strip_whitespace=True):
    """Load a sample or a prep template into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
        If there's a value in one of the reserved columns that cannot be cast
        to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally rows that start with a tab
    character will be ignored and columns that are empty will be removed.
    Empty sample names will be removed from the DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be lowercased
    on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |           sample_name |          str |
    +-----------------------+--------------+
    |     physical_location |          str |
    +-----------------------+--------------+
    | has_physical_specimen |         bool |
    +-----------------------+--------------+
    |    has_extracted_data |         bool |
    +-----------------------+--------------+
    |           sample_type |          str |
    +-----------------------+--------------+
    |       host_subject_id |          str |
    +-----------------------+--------------+
    |           description |          str |
    +-----------------------+--------------+
    |              latitude |        float |
    +-----------------------+--------------+
    |             longitude |        float |
    +-----------------------+--------------+
    """
    # Load in file lines
    with open_file(fn) as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(CONTROLLED_COLS)
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # na_values:
    #   the values that should be considered as empty, in this case only empty
    #   strings.
    # converters:
    #   ensure that sample names are not converted into any other types but
    #   strings and remove any trailing spaces. Don't let pandas try to guess
    #   the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters i. e. empty rows.
    template = pd.read_csv(
        StringIO(''.join(holdfile)),
        sep='\t',
        infer_datetime_format=True,
        keep_default_na=False,
        na_values=[''],
        parse_dates=True,
        index_col=False,
        comment='\t',
        mangle_dupe_cols=False,
        converters={
            'sample_name': lambda x: str(x).strip(),
            # required_sample_info
            'physical_location': str,
            'sample_type': str,
            # collection_timestamp is not added here
            'host_subject_id': str,
            'description': str,
            # common_prep_info
            'center_name': str,
            # NOTE: this key used to be misspelled 'center_projct_name',
            # so the converter never matched the real column
            'center_project_name': str})

    # let pandas infer the dtypes of these columns, if the inference is
    # not correct, then we have to raise an error.
    # The builtins int/float are used instead of np.int/np.float: those were
    # plain aliases of the builtins and no longer exist in NumPy >= 1.24.
    columns_to_dtype = [(['latitude', 'longitude'],
                         (int, float),
                         'integer or decimal'),
                        (['has_physical_specimen', 'has_extracted_data'],
                         np.bool_,
                         'boolean')]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all(
                    isinstance(val, c_dtype) for val in template[n]):
                raise QiitaDBColumnError("The '%s' column includes values "
                                         "that cannot be cast into a %s "
                                         "value " % (n, english_desc))

    initial_columns = set(template.columns)

    if 'sample_name' not in template.columns:
        raise QiitaDBColumnError("The 'sample_name' column is missing from "
                                 "your template, this file cannot be parsed.")

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=['sample_name'], how='all', inplace=True)

    # set the sample name as the index
    template.set_index('sample_name', inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    initial_columns.remove('sample_name')
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn('The following column(s) were removed from the template '
                      'because all their values are empty: '
                      '%s' % ', '.join(dropped_cols), QiitaDBWarning)

    return template
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If strict is true a FastqParse error will be
        raised if the seq and qual labels don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception will be raised if a
        quality score outside the range [0, 62] is detected
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to integers

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        yields the label, sequence and quality for each entry

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    FastqParseError
        If the file ends in the middle of a record, if `strict` is set and
        the sequence and quality labels differ, or if `enforce_qual_range`
        is set and a quality score falls outside [0, 62].

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]
    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as fh:
        # Four references to the SAME iterator: each zip_longest step pulls
        # four consecutive lines, i.e. one FASTQ record.
        iters = [iter(fh)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()
            # If the file simply ended in a blankline, do not error.
            # Truthiness is used (not ``is ''``) so the check also holds for
            # the bytes lines produced by a file opened in binary mode.
            if not seqid:
                continue
            # Error if an incomplete record is found
            # Note: seqid cannot be None, because if all 4 values were None,
            # then the loop condition would be false, and we could not have
            # gotten to this point
            if seq is None or qualid is None or qual is None:
                raise FastqParseError("Incomplete FASTQ record found at end "
                                      "of file")

            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()

            seqid = _drop_id_marker(seqid)

            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                # no ``decode`` attribute: the sequence is kept as it is
                pass

            qualid = _drop_id_marker(qualid)
            if strict:
                if seqid != qualid:
                    raise FastqParseError('ID mismatch: {} != {}'.format(
                        seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = phred_f(qual)
            if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                raise FastqParseError("Failed qual conversion for seq id: %s. "
                                      "This may be because you passed an "
                                      "incorrect value for phred_offset." %
                                      seqid)

            yield (seqid, seq, qual)