Example #1
    def to_file(self, out_f, delimiter='\t'):
        """Save the dissimilarity matrix to file in delimited text format.

        Parameters
        ----------
        out_f : file-like object or filename
            File-like object to write serialized data to, or name of
            file. If it's a file-like object, it must have a ``write``
            method, and it won't be closed. Else, it is opened and
            closed after writing.
        delimiter : str, optional
            Delimiter used to separate elements in output format.

        See Also
        --------
        from_file

        """
        with open_file(out_f, 'w') as out_f:
            formatted_ids = self._format_ids(delimiter)
            out_f.write(formatted_ids)
            out_f.write('\n')

            for id_, vals in zip(self.ids, self.data):
                out_f.write(id_)
                out_f.write(delimiter)
                out_f.write(delimiter.join(np.asarray(vals, dtype=str)))
                out_f.write('\n')
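A minimal usage sketch (hedged: assumes an older scikit-bio release where `DissimilarityMatrix` and its `to_file` method exist; the top-level import path and matrix values are illustrative). Because `to_file` delegates to `open_file`, the same call accepts either a filename or an open handle, and only the filename is opened and closed for you:

from io import StringIO
from skbio import DissimilarityMatrix

dm = DissimilarityMatrix([[0.0, 1.0], [1.0, 0.0]], ['a', 'b'])

# A filename: open_file opens it and closes it after writing.
dm.to_file('dm.txt')

# A file-like object: open_file passes it through and leaves it open.
buf = StringIO()
dm.to_file(buf)
print(buf.getvalue().splitlines()[0])  # the tab-delimited ID header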
Example #2
    def test_filehandle(self):
        """Filehandles slip through untouched"""
        with tempfile.TemporaryFile('r') as fh:
            with open_file(fh) as ffh:
                self.assertTrue(fh is ffh)
            # And it doesn't close the file-handle
            self.assertFalse(fh.closed)
Example #3
    def test_file_closed(self):
        """File gets closed in decorator"""
        f = tempfile.NamedTemporaryFile('r')
        filepath = f.name
        with open_file(filepath) as fh:
            pass
        self.assertTrue(fh.closed)
Example #4
def looks_like_qiime_mapping_file(fp):
    """Checks if the file looks like a QIIME mapping file

    Parameters
    ----------
    fp : str or file-like object
        filepath to check if it looks like a QIIME mapping file

    Returns
    -------
    bool
        True if fp looks like a QIIME mapping file, false otherwise.


    Notes
    -----
    This is not doing a validation of the QIIME mapping file. It simply checks
    the first line in the file and it returns true if the line starts with
    '#SampleID', since a sample/prep template will start with 'sample_name' or
    some other different column.
    """
    first_line = None
    with open_file(fp, mode='U') as f:
        first_line = f.readline()
    if not first_line:
        return False

    first_col = first_line.split()[0]
    return first_col == '#SampleID'
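A quick sketch exercising both branches (the file names are illustrative; assumes the function is importable from its defining module):

with open('mapping.txt', 'w') as f:
    f.write('#SampleID\tBarcodeSequence\n')
with open('template.txt', 'w') as f:
    f.write('sample_name\tdescription\n')

print(looks_like_qiime_mapping_file('mapping.txt'))   # True
print(looks_like_qiime_mapping_file('template.txt'))  # False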
Example #8
def load_mf(fn):
    import pandas as pd
    from skbio.io.util import open_file
    from emperor.qiime_backports.parse import parse_mapping_file
    with open_file(fn) as f:
        mapping_data, header, _ = parse_mapping_file(f)
        _mapping_file = pd.DataFrame(mapping_data, columns=header)
        _mapping_file.set_index('SampleID', inplace=True)
    return _mapping_file
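Driven, for example, like this (a sketch; 'mapping_file.txt' stands in for any QIIME mapping file, whose '#SampleID' header the backported parser rewrites to 'SampleID'):

mf = load_mf('mapping_file.txt')
print(mf.index.name)        # 'SampleID'
print(mf.columns.tolist())  # the remaining mapping-file columns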
Example #10
    def test_filehandle(self):
        """Filehandles slip through untouched"""
        with tempfile.TemporaryFile('r') as fh:
            with tempfile.TemporaryFile('r') as fh2:
                with open_file([fh, fh2]) as fhs:
                    self.assertTrue(fh is fhs[0])
                    self.assertTrue(fh2 is fhs[1])
                # And it doesn't close the file-handles
                for fh in fhs:
                    self.assertFalse(fh.closed)
Example #12
    def test_file_closed_harder(self):
        """File gets closed in decorator, even if exceptions happen."""
        f = tempfile.NamedTemporaryFile('r')
        filepath = f.name
        try:
            with open_file(filepath) as fh:
                raise TypeError
        except TypeError:
            self.assertTrue(fh.closed)
        else:
            # If we're here, no exceptions have been raised inside the
            # try clause, so the context manager swallowed them. No
            # good.
            raise Exception("`open_file` didn't propagate exceptions")
Example #14
File: parse.py Project: XMUCCF/qiime
def parse_items(fp):
    """Parse items from a file where each item is in a different line

    Parameters
    ----------
    fp : str/bytes/unicode string or file-like
        Filepath or file-like object to parse.

    Returns
    -------
    list
        List of the items parsed from the file
    """
    with open_file(fp, 'U') as f:
        items = f.read().strip('\n').split('\n')

    if items == ['']:
        items = []
    return items
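A round-trip sketch ('items.txt' is illustrative; note that 'U' is a Python-2-era universal-newlines mode flag):

with open('items.txt', 'w') as f:
    f.write('itemA\nitemB\nitemC\n')

print(parse_items('items.txt'))  # ['itemA', 'itemB', 'itemC']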
Example #16
    def parser(lines):
        with open_file(lines) as lines:
            curr = []
            for l in lines:
                try:
                    l = str(l.decode("utf-8"))
                except AttributeError:
                    pass

                if constructor is not None:
                    line = constructor(l)
                else:
                    line = l
                if ignore(line):
                    continue
                # if we find the label, return the previous record
                if is_label_line(line):
                    if curr:
                        yield curr
                        curr = []
                curr.append(line)
            # don't forget to return the last record in the file
            if curr:
                yield curr
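This closure's free variables (`constructor`, `ignore`, `is_label_line`) are supplied by an enclosing factory. A hedged, self-contained sketch of how such a record finder might be assembled and driven over FASTA-like input (the factory name is hypothetical; it relies only on `open_file` passing file-like objects through untouched, as the tests above assert):

from io import StringIO
from skbio.io.util import open_file

def labeled_record_finder(is_label_line, constructor=None,
                          ignore=lambda line: False):
    # Hypothetical factory; the closure body mirrors the example above.
    def parser(lines):
        with open_file(lines) as lines:
            curr = []
            for l in lines:
                try:
                    l = str(l.decode('utf-8'))
                except AttributeError:
                    pass
                line = constructor(l) if constructor is not None else l
                if ignore(line):
                    continue
                # a label line closes out the previous record
                if is_label_line(line):
                    if curr:
                        yield curr
                        curr = []
                curr.append(line)
            # don't forget the last record in the input
            if curr:
                yield curr
    return parser

fasta_like = labeled_record_finder(lambda line: line.startswith('>'),
                                   constructor=str.rstrip)
for record in fasta_like(StringIO('>seq1\nACGT\n>seq2\nGGCC\n')):
    print(record)
# ['>seq1', 'ACGT']
# ['>seq2', 'GGCC']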
Example #18
def load_template_to_dataframe(fn, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally rows that start with a '\t'
    character will be ignored and columns that are empty will be removed. Empty
    sample names will be removed from the DataFrame.

    Column names are case-insensitive but will be lowercased on addition to
    the database

    Everything in the DataFrame will be read and managed as strings
    """
    # Load in file lines
    holdfile = None
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # Strip all values in the cells in the input file
    for pos, line in enumerate(holdfile):
        cols = line.split('\t')
        if pos == 0 and index != 'SampleID':
            # get and clean the controlled columns
            ccols = {'sample_name'}
            ccols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
            newcols = [
                c.lower().strip() if c.lower().strip() in ccols
                else c.strip()
                for c in cols]

            # while we are here, let's check for duplicate columns headers
            if len(set(newcols)) != len(newcols):
                raise qdb.exceptions.QiitaDBDuplicateHeaderError(
                    find_duplicates(newcols))
        else:
            # .strip removes odd chars, newlines, tabs and multiple
            # spaces, but we need to keep the newline at the end of
            # each line (hence the + '\n' below)
            newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]

        holdfile[pos] = '\t'.join(newcols) + '\n'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters, i.e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            dtype=str,
            encoding='utf-8',
            infer_datetime_format=False,
            keep_default_na=False,
            index_col=False,
            comment='\t',
            converters={index: lambda x: str(x).strip()})
        # remove newlines and tabs from fields
        template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
                         regex=True, inplace=True)
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    errors[headers[col]].append(row)
        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
                 for header, rows in viewitems(errors)]
        raise qdb.exceptions.QiitaDBError(
            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))

    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns so let's
    # find the columns that are all ''
    columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0))
    template.drop(template.columns[columns], axis=1, inplace=True)

    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s'
            % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
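A usage sketch under the assumptions above (requires the qiita environment this module imports; the file name and contents are illustrative):

with open('sample_template.txt', 'w') as f:
    f.write('sample_name\tdescription\n'
            'S1\tsoil sample\n'
            'S2\twater sample\n')

df = load_template_to_dataframe('sample_template.txt')
print(df.loc['S1', 'description'])  # 'soil sample'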
Example #19
def parse_qseq(infile, phred_offset=33):
    r"""Generator of seq ids, seqs, quals and other records from a qseq file.

    Parameters
    ----------
    infile : open file object or str
        An open qseq file or a path to a qseq file.
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols
        to integers.

    Returns
    -------
    four-item tuple: (str, str, np.array(dtype=int), namedtuple)
        yields the sequence id, sequence, qual array and other record
        information for each entry.  The sequence ID format is:
        <Machine name>_<Run number>:<Lane number>:<Tile number>:<x>:<y>#
        <Index>/<Read number>.  The namedtuple attributes are:
        machine_name, run, lane, tile, x, y, index, read and filtered.

    Examples
    --------
    Assume we have a qseq-formatted file with the following contents::

        CRESSIA       242     1       2204    1453    1918    0       1
            .TTAATAAGAATGTCTGTTGTGGCTTAAAA  B[[[W][Y[Zccccccccc\cccac_____  1
        CRESSIA       242     1       2204    1490    1921    0       2
            ..GTAAAACCCATATATTGAAAACTACAAA  BWUTWcXVXXcccc_cccccccccc_cccc  1

    >>> from six import StringIO
    >>> qseq_f = StringIO('CRESSIA\t242\t1\t2204\t1453\t1918\t0\t1\t'
    ...   '.TTAATAAGAATGTCTGTTGTGGCTTAAAA\tB[[[W][Y[Zccccccccc\cccac_____\t1\n'
    ...                   'CRESSIA\t242\t1\t2204\t1490\t1921\t0\t2\t'
    ...   '..GTAAAACCCATATATTGAAAACTACAAA\tBWUTWcXVXXcccc_cccccccccc_cccc\t1\n'
    ... )

    We can parse this as follows:

    >>> from skbio import parse_qseq
    >>> for seq_id, seq, qual, record in parse_qseq(qseq_f, phred_offset=64):
    ...     print(seq_id)
    ...     print(seq)
    ...     print(qual[:10])
    ...     print(record.run)
    ...     print(record.lane)
    CRESSIA_242:1:2204:1453:1918#0/1
    .TTAATAAGAATGTCTGTTGTGGCTTAAAA
    [ 2 27 27 27 23 29 27 25 27 26]
    242
    1
    CRESSIA_242:1:2204:1490:1921#0/2
    ..GTAAAACCCATATATTGAAAACTACAAA
    [ 2 23 21 20 23 35 24 22 24 24]
    242
    1
    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    # namedtuple to store all other record information
    Record = collections.namedtuple(
        'Record',
        ['machine_name',
         'run',
         'lane',
         'tile',
         'x',
         'y',
         'index',
         'read',
         'filtered'])

    with open_file(infile) as lines:
        for rec in lines:
            try:
                rec = str(rec.decode('utf-8'))
            except AttributeError:
                pass
            # parse record.
            try:
                (machine_name, run, lane, tile, x, y, index, read, seq, qual,
                 filtered) = rec.split()
            except ValueError:
                raise QseqParseError("Invalid QSEQ record found.")
            # sequence ID is formatted using the first eight items.
            seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % (
                machine_name, run, lane, tile, x, y, index, read)
            # qual string is converted to an array of ints.
            qual = phred_f(qual)
            # other items are returned as a namedtuple
            record = Record(
                machine_name=machine_name,
                run=int(run),
                lane=int(lane),
                tile=int(tile),
                x=int(x),
                y=int(y),
                index=int(index),
                read=int(read),
                filtered=bool(int(filtered)))

            yield seq_id, seq, qual, record
Example #21
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally rows that start with a '\t'
    character will be ignored and columns that are empty will be removed. Empty
    sample names will be removed from the DataFrame.

    Column names are case-insensitive but will be lowercased on addition to
    the database

    Everything in the DataFrame will be read and managed as strings
    """
    # Load in file lines
    holdfile = None
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # Check that we don't have duplicate columns
    col_names = [c.lower() for c in holdfile[0].strip().split('\t')]
    if len(set(col_names)) != len(col_names):
        raise qdb.exceptions.QiitaDBDuplicateHeaderError(
            find_duplicates(col_names))

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters, i.e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            dtype=str,
            encoding='utf-8',
            infer_datetime_format=False,
            keep_default_na=False,
            index_col=False,
            comment='\t',
            converters={index: lambda x: str(x).strip()})
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    errors[headers[col]].append(row)
        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
                 for header, rows in viewitems(errors)]
        raise qdb.exceptions.QiitaDBError(
            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))

    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns so let's
    # find the columns that are all ''
    columns = np.where(np.all(template.applymap(lambda x: x == ''), axis=0))
    template.drop(template.columns[columns], axis=1, inplace=True)

    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s'
            % ', '.join(dropped_cols), qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
Example #22
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
        If there's a value in one of the reserved columns that cannot be cast
        to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally rows that start with a '\t'
    character will be ignored and columns that are empty will be removed. Empty
    sample names will be removed from the DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be lowercased
    on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |           sample_name |          str |
    +-----------------------+--------------+
    |             #SampleID |          str |
    +-----------------------+--------------+
    |     physical_location |          str |
    +-----------------------+--------------+
    | has_physical_specimen |         bool |
    +-----------------------+--------------+
    |    has_extracted_data |         bool |
    +-----------------------+--------------+
    |           sample_type |          str |
    +-----------------------+--------------+
    |       host_subject_id |          str |
    +-----------------------+--------------+
    |           description |          str |
    +-----------------------+--------------+
    |              latitude |        float |
    +-----------------------+--------------+
    |             longitude |        float |
    +-----------------------+--------------+
    """
    # Load in file lines
    holdfile = None
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(
                d.strip(" \r\x0b\x0c") for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(qdb.metadata_template.constants.CONTROLLED_COLS)
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # na_values:
    #   the values that should be considered as empty
    # true_values:
    #   the values that should be considered "True" for boolean columns
    # false_values:
    #   the values that should be considered "False" for boolean columns
    # converters:
    #   ensure that sample names are not converted into any other types but
    #   strings and remove any trailing spaces. Don't let pandas try to guess
    #   the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters, i.e. empty rows.
    try:
        template = pd.read_csv(
            StringIO(''.join(holdfile)),
            sep='\t',
            encoding='utf-8',
            infer_datetime_format=True,
            keep_default_na=False,
            na_values=qdb.metadata_template.constants.NA_VALUES,
            true_values=qdb.metadata_template.constants.TRUE_VALUES,
            false_values=qdb.metadata_template.constants.FALSE_VALUES,
            parse_dates=True,
            index_col=False,
            comment='\t',
            mangle_dupe_cols=False,
            converters={
                index: lambda x: str(x).strip(),
                # required sample template information
                'physical_location': str,
                'sample_type': str,
                # collection_timestamp is not added here
                'host_subject_id': str,
                'description': str,
                # common prep template information
                'center_name': str,
                'center_projct_name': str
            })
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    errors[headers[col]].append(row)
        lines = [
            '%s: row(s) %s' % (header, ', '.join(map(str, rows)))
            for header, rows in viewitems(errors)
        ]
        raise qdb.exceptions.QiitaDBError(
            'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))

    # Check that we don't have duplicate columns
    if len(set(template.columns)) != len(template.columns):
        raise qdb.exceptions.QiitaDBDuplicateHeaderError(
            find_duplicates(template.columns))

    # let pandas infer the dtypes of these columns, if the inference is
    # not correct, then we have to raise an error
    columns_to_dtype = [
        (['latitude', 'longitude'], (int, float), 'integer or decimal'),
        (['has_physical_specimen', 'has_extracted_data'], np.bool_, 'boolean')
    ]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all(
                [isinstance(val, c_dtype) for val in template[n]]):
                raise qdb.exceptions.QiitaDBColumnError(
                    "The '%s' column includes values that cannot be cast "
                    "into a %s value " % (n, english_desc))

    initial_columns = set(template.columns)

    if index not in template.columns:
        raise qdb.exceptions.QiitaDBColumnError(
            "The '%s' column is missing from your template, this file cannot "
            "be parsed." % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn(
            'The following column(s) were removed from the template because '
            'all their values are empty: %s' % ', '.join(dropped_cols),
            qdb.exceptions.QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
Example #23
    def from_file(cls, ord_res_f):
        r"""Load ordination results from text file.

        Creates a `OrdinationResults` instance from serialized results
        stored as text.

        `ord_res_f` must be a file-like object containing text.

        The ord_res_f format should look like::

            Eigvals<tab>2
            0.096<tab>0.040

            Proportion explained<tab>2
            0.512<tab>0.488

            Species<tab>3<tab>2
            Species1<tab>0.408<tab>0.069
            Species2<tab>-0.115<tab>-0.299
            Species3<tab>-0.309<tab>0.187

            Site<tab>3<tab>2
            Site1<tab>-0.848<tab>0.882
            Site2<tab>-0.220<tab>-1.344
            Site3<tab>1.666<tab>0.470

            Biplot<tab>4<tab>3
            0.422<tab>-0.559<tab>-0.713
            0.988<tab>0.150<tab>-0.011
            -0.556<tab>0.817<tab>0.147
            -0.404<tab>-0.905<tab>-0.127

            Site constraints<tab>3<tab>2
            Site1<tab>-0.848<tab>0.882
            Site2<tab>-0.220<tab>-1.344
            Site3<tab>1.666<tab>0.470

        If a given result attribute is not present (e.g. Biplot), it should
        still be defined and declare its dimensions as 0::

            Biplot<tab>0<tab>0

        Parameters
        ----------
        ord_res_f : iterable of str or str
            Iterable of strings (e.g., open file handle, file-like object, list
            of strings, etc.) or a file path (a string) containing the
            serialized ordination results.

        Returns
        -------
        OrdinationResults
            Instance of type `cls` containing the parsed contents of
            `ord_res_f`.

        Raises
        ------
        ValueError
            if the shapes of the different sections of the file are not
            consistent
        FileFormatError
            if the format of the file is not recognized

        Examples
        --------
        Assume we have the following tab-delimited text file storing the
        ordination results::

            Eigvals\t2
            0.0961330159181\t0.0409418140138

            Proportion explained\t0

            Species\t3\t2
            Species1\t0.408869425742\t0.0695518116298
            Species2\t-0.1153860437\t-0.299767683538
            Species3\t-0.309967102571\t0.187391917117

            Site\t3\t2
            Site1\t-0.848956053187\t0.882764759014
            Site2\t-0.220458650578\t-1.34482000302
            Site3\t1.66697179591\t0.470324389808

            Biplot\t0\t0

            Site constraints\t0\t0

        Load the ordination results from the file:

        >>> from StringIO import StringIO
        >>> from skbio.stats.ordination import OrdinationResults
        >>> or_f = StringIO("Eigvals\t2\n"
        ...                 "0.0961330159181\t0.0409418140138\n"
        ...                 "\n"
        ...                 "Proportion explained\t0\n"
        ...                 "\n"
        ...                 "Species\t3\t2\n"
        ...                 "Species1\t0.408869425742\t0.0695518116298\n"
        ...                 "Species2\t-0.1153860437\t-0.299767683538\n"
        ...                 "Species3\t-0.309967102571\t0.187391917117\n"
        ...                 "\n"
        ...                 "Site\t3\t2\n"
        ...                 "Site1\t-0.848956053187\t0.882764759014\n"
        ...                 "Site2\t-0.220458650578\t-1.34482000302\n"
        ...                 "Site3\t1.66697179591\t0.470324389808\n"
        ...                 "\n"
        ...                 "Biplot\t0\t0\n"
        ...                 "\n"
        ...                 "Site constraints\t0\t0\n")
        >>> ord_res = OrdinationResults.from_file(or_f)
        """

        with open_file(ord_res_f, 'U') as fd:
            orf = iter(fd)

            # Starting at line 0, we should find the eigvals
            eigvals = cls._parse_eigvals(orf)
            # The next line should be an empty line
            cls._check_empty_line(orf)
            # Now we should find the proportion explained section
            prop_expl = cls._parse_proportion_explained(orf)

            if prop_expl is not None:
                if len(prop_expl) != len(eigvals):
                    raise ValueError(
                        'There should be as many proportion explained'
                        ' values as eigvals: %d != %d' %
                        (len(prop_expl), len(eigvals)))

            # The next line should be an empty line
            cls._check_empty_line(orf)
            # Next section should be the species section
            species, species_ids = cls._parse_coords(orf, 'Species')
            if species is not None:
                if len(species[0]) != len(eigvals):
                    raise ValueError(
                        'There should be as many coordinates per'
                        ' species as eigvals: %d != %d' %
                        (len(species[0]), len(eigvals)))

            # The next line should be an empty line
            cls._check_empty_line(orf)
            # Next section should be the site section
            site, site_ids = cls._parse_coords(orf, 'Site')
            if site is not None:
                if len(site[0]) != len(eigvals):
                    raise ValueError(
                        'There should be as many coordinates per'
                        ' site as eigvals: %d != %d' %
                        (len(site[0]), len(eigvals)))

            # The next line should be an empty line
            cls._check_empty_line(orf)
            # Next section should be the biplot section
            biplot = cls._parse_biplot(orf)
            # The next line should be an empty line
            cls._check_empty_line(orf)
            # Next section should be the site constraints section
            cons, cons_ids = cls._parse_coords(orf, 'Site constraints')
            if cons_ids is not None and site_ids is not None:
                if cons_ids != site_ids:
                    raise ValueError(
                        'Site constraints ids and site ids must be'
                        ' equal: %s != %s' % (cons_ids, site_ids))

        return cls(eigvals=eigvals, species=species, site=site, biplot=biplot,
                   site_constraints=cons, proportion_explained=prop_expl,
                   species_ids=species_ids, site_ids=site_ids)
Example #24
    def test_BytesIO(self):
        """BytesIO (useful e.g. for testing) slips through."""
        f = BytesIO(b"File contents")
        with open_file(f) as fh:
            self.assertTrue(fh is f)
Example #26
File: util.py Project: jenwei/qiita
def load_template_to_dataframe(fn, strip_whitespace=True, index='sample_name'):
    """Load a sample/prep template or a QIIME mapping file into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file
    index : str, optional
        Defaults to 'sample_name'. The index to use in the loaded information

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
        If there's a value in one of the reserved columns that cannot be cast
        to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.
    QiitaDBError
        When non UTF-8 characters are found in the file.
    QiitaDBDuplicateHeaderError
        If duplicate columns are present in the template

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally rows that start with a '\t'
    character will be ignored and columns that are empty will be removed. Empty
    sample names will be removed from the DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be lowercased
    on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |           sample_name |          str |
    +-----------------------+--------------+
    |             #SampleID |          str |
    +-----------------------+--------------+
    |     physical_location |          str |
    +-----------------------+--------------+
    | has_physical_specimen |         bool |
    +-----------------------+--------------+
    |    has_extracted_data |         bool |
    +-----------------------+--------------+
    |           sample_type |          str |
    +-----------------------+--------------+
    |       host_subject_id |          str |
    +-----------------------+--------------+
    |           description |          str |
    +-----------------------+--------------+
    |              latitude |        float |
    +-----------------------+--------------+
    |             longitude |        float |
    +-----------------------+--------------+
    """
    # Load in file lines
    holdfile = None
    with open_file(fn, mode='U') as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(CONTROLLED_COLS)
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)

    if index == "#SampleID":
        # We're going to parse a QIIME mapping file. We are going to first
        # parse it with the QIIME function so we can remove the comments
        # easily and make sure that QIIME will accept this as a mapping file
        data, headers, comments = _parse_mapping_file(holdfile)
        holdfile = ["%s\n" % '\t'.join(d) for d in data]
        holdfile.insert(0, "%s\n" % '\t'.join(headers))
        # The QIIME parser fixes the index and removes the #
        index = 'SampleID'

    # index_col:
    #   is set as False, otherwise it is cast as a float and we want a string
    # keep_default:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # na_values:
    #   the values that should be considered as empty
    # true_values:
    #   the values that should be considered "True" for boolean columns
    # false_values:
    #   the values that should be considered "False" for boolean columns
    # converters:
    #   ensure that sample names are not converted into any other types but
    #   strings and remove any trailing spaces. Don't let pandas try to guess
    #   the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that are
    #   constituted only by delimiters, i.e. empty rows.
    try:
        template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
                               encoding='utf-8', infer_datetime_format=True,
                               keep_default_na=False, na_values=NA_VALUES,
                               true_values=TRUE_VALUES,
                               false_values=FALSE_VALUES,
                               parse_dates=True, index_col=False, comment='\t',
                               mangle_dupe_cols=False, converters={
                                   index: lambda x: str(x).strip(),
                                   # required sample template information
                                   'physical_location': str,
                                   'sample_type': str,
                                   # collection_timestamp is not added here
                                   'host_subject_id': str,
                                   'description': str,
                                   # common prep template information
                                   'center_name': str,
                                   'center_projct_name': str})
    except UnicodeDecodeError:
        # Find row number and col number for utf-8 encoding errors
        headers = holdfile[0].strip().split('\t')
        errors = defaultdict(list)
        for row, line in enumerate(holdfile, 1):
            for col, cell in enumerate(line.split('\t')):
                try:
                    cell.encode('utf-8')
                except UnicodeError:
                    errors[headers[col]].append(row)
        lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
                 for header, rows in viewitems(errors)]
        raise QiitaDBError('Non UTF-8 characters found in columns:\n' +
                           '\n'.join(lines))

    # Check that we don't have duplicate columns
    if len(set(template.columns)) != len(template.columns):
        raise QiitaDBDuplicateHeaderError(find_duplicates(template.columns))

    # let pandas infer the dtypes of these columns, if the inference is
    # not correct, then we have to raise an error
    columns_to_dtype = [(['latitude', 'longitude'], (int, float),
                         'integer or decimal'),
                        (['has_physical_specimen', 'has_extracted_data'],
                         np.bool_, 'boolean')]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all([isinstance(val, c_dtype)
                                                  for val in template[n]]):
                raise QiitaDBColumnError("The '%s' column includes values "
                                         "that cannot be cast into a %s "
                                         "value " % (n, english_desc))

    initial_columns = set(template.columns)

    if index not in template.columns:
        raise QiitaDBColumnError("The '%s' column is missing from "
                                 "your template, this file cannot be parsed."
                                 % index)

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=[index], how='all', inplace=True)

    # set the sample name as the index
    template.set_index(index, inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    initial_columns.remove(index)
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn('The following column(s) were removed from the template '
                      'because all their values are empty: '
                      '%s' % ', '.join(dropped_cols), QiitaDBWarning)

    # Pandas represents data with np.nan rather than Nones, change it to None
    # because psycopg2 knows that a None is a Null in SQL, while it doesn't
    # know what to do with NaN
    template = template.where((pd.notnull(template)), None)

    return template
Example #27
    def to_file(self, out_f):
        """Save the ordination results to file in text format.

        Parameters
        ----------
        out_f : file-like object or filename
            File-like object to write serialized data to, or name of
            file. If it's a file-like object, it must have a ``write``
            method, and it won't be closed. Else, it is opened and
            closed after writing.

        See Also
        --------
        from_file

        """
        with open_file(out_f, 'w') as out_f:
            # Write eigvals
            out_f.write("Eigvals\t%d\n" % self.eigvals.shape)
            out_f.write("%s\n\n" % '\t'.join(np.asarray(self.eigvals,
                                                        dtype=np.str)))

            # Write proportion explained
            if self.proportion_explained is None:
                out_f.write("Proportion explained\t0\n\n")
            else:
                out_f.write("Proportion explained\t%d\n" %
                            self.proportion_explained.shape)
                out_f.write("%s\n\n" % '\t'.join(
                    np.asarray(self.proportion_explained, dtype=np.str)))

            # Write species
            if self.species is None:
                out_f.write("Species\t0\t0\n\n")
            else:
                out_f.write("Species\t%d\t%d\n" % self.species.shape)
                for id_, vals in zip(self.species_ids, self.species):
                    out_f.write("%s\t%s\n" % (id_, '\t'.join(np.asarray(vals,
                                dtype=np.str))))
                out_f.write("\n")

            # Write site
            if self.site is None:
                out_f.write("Site\t0\t0\n\n")
            else:
                out_f.write("Site\t%d\t%d\n" % self.site.shape)
                for id_, vals in zip(self.site_ids, self.site):
                    out_f.write("%s\t%s\n" % (id_, '\t'.join(
                        np.asarray(vals, dtype=np.str))))
                out_f.write("\n")

            # Write biplot
            if self.biplot is None:
                out_f.write("Biplot\t0\t0\n\n")
            else:
                out_f.write("Biplot\t%d\t%d\n" % self.biplot.shape)
                for vals in self.biplot:
                    out_f.write("%s\n" % '\t'.join(
                        np.asarray(vals, dtype=np.str)))
                out_f.write("\n")

            # Write site-constraints
            if self.site_constraints is None:
                out_f.write("Site constraints\t0\t0\n")
            else:
                out_f.write("Site constraints\t%d\t%d\n" %
                            self.site_constraints.shape)
                for id_, vals in zip(self.site_ids, self.site_constraints):
                    out_f.write("%s\t%s\n" % (id_, '\t'.join(
                        np.asarray(vals, dtype=np.str))))
Example #28
    def from_file(cls, dm_f, delimiter='\t'):
        """Load dissimilarity matrix from a delimited text file or file path.

        Creates a `DissimilarityMatrix` instance from a serialized
        dissimilarity matrix stored as delimited text.

        `dm_f` can be a file-like or a file path object containing delimited
        text. The first line (header) must contain the IDs of each object. The
        subsequent lines must contain an ID followed by each dissimilarity
        (float) between the current object and all other objects, where the
        order of objects is determined by the header line.  For example, a 2x2
        dissimilarity matrix with IDs ``'a'`` and ``'b'`` might look like::

            <del>a<del>b
            a<del>0.0<del>1.0
            b<del>1.0<del>0.0

        where ``<del>`` is the delimiter between elements.

        Parameters
        ----------
        dm_f : iterable of str or str
            Iterable of strings (e.g., open file handle, file-like object, list
            of strings, etc.) or a file path (a string) containing a serialized
            dissimilarity matrix.
        delimiter : str, optional
            String delimiting elements in `dm_f`.

        Returns
        -------
        DissimilarityMatrix
            Instance of type `cls` containing the parsed contents of `dm_f`.

        Notes
        -----
        Whitespace-only lines can occur anywhere throughout the "file" and are
        ignored. Lines starting with ``#`` are treated as comments and ignored.
        These comments can only occur *before* the ID header.

        IDs will have any leading/trailing whitespace removed when they are
        parsed.

        .. note::
            File-like objects passed to this method will not be closed upon
            completion of the parsing; it is the responsibility of the owner
            of the object to close them.

        """
        # We aren't using np.loadtxt because it uses *way* too much memory
        # (e.g., a 2GB matrix eats up 10GB, which then isn't freed after parsing
        # has finished). See:
        # http://mail.scipy.org/pipermail/numpy-tickets/2012-August/006749.html

        with open_file(dm_f, 'U') as dm_f:

            # We use iter() as we want to take a single pass over the
            # iterable and maintain our current position after finding
            # the header (mainly necessary for something like a list
            # of strings).
            dm_f = iter(dm_f)

            # Strategy:
            #   - find the header
            #   - initialize an empty ndarray
            #   - for each row of data in the input file:
            #     - populate the corresponding row in the ndarray with floats

            ids = cls._parse_ids(dm_f, delimiter)
            num_ids = len(ids)
            data = np.empty((num_ids, num_ids), dtype=np.float64)

            # curr_row_idx keeps track of the row index within the data matrix.
            # We're not using enumerate() because there may be
            # empty/whitespace-only lines throughout the data matrix. We want
            # to ignore those and only count the actual rows of data.
            curr_row_idx = 0
            for line in dm_f:
                line = line.strip()

                if not line:
                    continue
                elif curr_row_idx >= num_ids:
                    # We've hit a nonempty line after we already filled the
                    # data matrix. Raise an error because we shouldn't ignore
                    # extra data.
                    raise DissimilarityMatrixFormatError(
                        "Encountered extra rows without corresponding IDs in"
                        " the header.")

                tokens = line.split(delimiter)

                # -1 because the first element contains the current ID.
                if len(tokens) - 1 != num_ids:
                    raise DissimilarityMatrixFormatError(
                        "There are %d values in row number %d, which is not"
                        " equal to the number of IDs in the header (%d)."
                        % (len(tokens) - 1, curr_row_idx + 1, num_ids))

                curr_id = tokens[0].strip()
                expected_id = ids[curr_row_idx]
                if curr_id == expected_id:
                    data[curr_row_idx, :] = np.asarray(tokens[1:], dtype=float)
                else:
                    raise DissimilarityMatrixFormatError(
                        "Encountered mismatched IDs while parsing the "
                        "dissimilarity matrix file. Found '%s' but expected "
                        "'%s'. Please ensure that the IDs match between the "
                        "dissimilarity matrix header (first row) and the row "
                        "labels (first column)." % (curr_id, expected_id))

                curr_row_idx += 1

        if curr_row_idx != num_ids:
            raise DissimilarityMatrixFormatError(
                "Expected %d row(s) of data, but found %d." % (num_ids,
                                                               curr_row_idx))

        return cls(data, ids)
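For reference, a hedged usage sketch of the parser above, assuming
``DissimilarityMatrix`` has been imported from the hosting library. The
input mirrors the 2x2 example from the docstring:

from io import StringIO

serialized = ("\ta\tb\n"
              "a\t0.0\t1.0\n"
              "b\t1.0\t0.0\n")
dm = DissimilarityMatrix.from_file(StringIO(serialized))
# dm.ids  -> ('a', 'b')
# dm.data -> a 2x2 float64 ndarray with zeros on the diagonal

A list of strings would work equally well for ``dm_f``, since the parser only
needs an iterable of lines.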
Example #29
File: util.py Project: aashish24/qiita
def load_template_to_dataframe(fn, strip_whitespace=True):
    """Load a sample or a prep template into a data frame

    Parameters
    ----------
    fn : str or file-like object
        filename of the template to load, or an already open template file
    strip_whitespace : bool, optional
        Defaults to True. Whether or not to strip whitespace from values in the
        input file

    Returns
    -------
    DataFrame
        Pandas dataframe with the loaded information

    Raises
    ------
    ValueError
        Empty file passed
    QiitaDBColumnError
        If the sample_name column is not present in the template.
        If there's a value in one of the reserved columns that cannot be cast
        to the needed type.
    QiitaDBWarning
        When columns are dropped because they have no content for any sample.

    Notes
    -----
    The index attribute of the DataFrame will be forced to be 'sample_name'
    and will be cast to a string. Additionally, rows that start with a '\t'
    character will be ignored and columns that are empty will be removed.
    Empty sample names will be removed from the DataFrame.

    The following table describes the data type per column that will be
    enforced in `fn`. Column names are case-insensitive but will be lowercased
    on addition to the database.

    +-----------------------+--------------+
    |      Column Name      |  Python Type |
    +=======================+==============+
    |           sample_name |          str |
    +-----------------------+--------------+
    |     physical_location |          str |
    +-----------------------+--------------+
    | has_physical_specimen |         bool |
    +-----------------------+--------------+
    |    has_extracted_data |         bool |
    +-----------------------+--------------+
    |           sample_type |          str |
    +-----------------------+--------------+
    |       host_subject_id |          str |
    +-----------------------+--------------+
    |           description |          str |
    +-----------------------+--------------+
    |              latitude |        float |
    +-----------------------+--------------+
    |             longitude |        float |
    +-----------------------+--------------+
    """
    # Load in file lines
    holdfile = None
    with open_file(fn) as f:
        holdfile = f.readlines()
    if not holdfile:
        raise ValueError('Empty file passed!')

    # Strip all values in the cells in the input file, if requested
    if strip_whitespace:
        for pos, line in enumerate(holdfile):
            holdfile[pos] = '\t'.join(d.strip(" \r\x0b\x0c")
                                      for d in line.split('\t'))

    # get and clean the controlled columns
    cols = holdfile[0].split('\t')
    controlled_cols = {'sample_name'}
    controlled_cols.update(CONTROLLED_COLS)
    holdfile[0] = '\t'.join(c.lower() if c.lower() in controlled_cols else c
                            for c in cols)
    # index_col:
    #   is set to False; otherwise the index is cast as a float and we want
    #   a string
    # keep_default_na:
    #   is set as False, to avoid inferring empty/NA values with the defaults
    #   that Pandas has.
    # na_values:
    #   the values that should be considered as empty, in this case only empty
    #   strings.
    # converters:
    #   ensure that sample names are not converted into any other types but
    #   strings and remove any trailing spaces. Don't let pandas try to guess
    #   the dtype of the other columns, force them to be a str.
    # comment:
    #   using the tab character as "comment" we remove rows that consist
    #   only of delimiters, i.e., empty rows.
    template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
                           infer_datetime_format=True,
                           keep_default_na=False, na_values=[''],
                           parse_dates=True, index_col=False, comment='\t',
                           mangle_dupe_cols=False, converters={
                               'sample_name': lambda x: str(x).strip(),
                               # required_sample_info
                               'physical_location': str,
                               'sample_type': str,
                               # collection_timestamp is not added here
                               'host_subject_id': str,
                               'description': str,
                               # common_prep_info
                               'center_name': str,
                               'center_projct_name': str})

    # let pandas infer the dtypes of these columns, if the inference is
    # not correct, then we have to raise an error
    columns_to_dtype = [(['latitude', 'longitude'], (int, float),
                         'integer or decimal'),
                        (['has_physical_specimen', 'has_extracted_data'],
                         np.bool_, 'boolean')]
    for columns, c_dtype, english_desc in columns_to_dtype:
        for n in columns:
            if n in template.columns and not all([isinstance(val, c_dtype)
                                                  for val in template[n]]):
                raise QiitaDBColumnError("The '%s' column includes values "
                                         "that cannot be cast into a %s "
                                         "value" % (n, english_desc))

    initial_columns = set(template.columns)

    if 'sample_name' not in template.columns:
        raise QiitaDBColumnError("The 'sample_name' column is missing from "
                                 "your template, this file cannot be parsed.")

    # remove rows that have no sample identifier but that may have other data
    # in the rest of the columns
    template.dropna(subset=['sample_name'], how='all', inplace=True)

    # set the sample name as the index
    template.set_index('sample_name', inplace=True)

    # it is not uncommon to find templates that have empty columns
    template.dropna(how='all', axis=1, inplace=True)

    initial_columns.remove('sample_name')
    dropped_cols = initial_columns - set(template.columns)
    if dropped_cols:
        warnings.warn('The following column(s) were removed from the template '
                      'because all their values are empty: '
                      '%s' % ', '.join(dropped_cols), QiitaDBWarning)

    return template
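A hedged usage sketch for ``load_template_to_dataframe``; the template text
is invented for illustration and only uses columns from the type table in
the docstring:

from io import StringIO

template_text = ("sample_name\tdescription\tlatitude\tlongitude\n"
                 "S1\tsoil sample\t40.0\t-105.3\n"
                 "S2\twater sample\t39.5\t-104.9\n")
df = load_template_to_dataframe(StringIO(template_text))
# df is indexed by sample_name ('S1', 'S2'); latitude and longitude must
# come out numeric, otherwise QiitaDBColumnError is raised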
Example #30
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If strict is ``True``, a ``FastqParseError``
        will be raised if the seq and qual labels don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception will be raised if a
        quality score outside the range [0, 62] is detected.
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to integers

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        Yields the label, sequence, and quality for each entry.

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from io import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]
    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as data:
        iters = [iter(data)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()
            # If the file simply ended in a blank line, do not error
            if not seqid:
                continue
            # Error if an incomplete record is found
            # Note: seqid cannot be None, because if all 4 values were None,
            # then the loop condition would be false, and we could not have
            # gotten to this point
            if seq is None or qualid is None or qual is None:
                raise FastqParseError("Incomplete FASTQ record found at end "
                                      "of file")

            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()

            seqid = _drop_id_marker(seqid)

            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                pass

            qualid = _drop_id_marker(qualid)
            if strict:
                if seqid != qualid:
                    raise FastqParseError('ID mismatch: {} != {}'.format(
                        seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = phred_f(qual)
            if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                raise FastqParseError("Failed qual conversion for seq id: %s. "
                                      "This may be because you passed an "
                                      "incorrect value for phred_offset." %
                                      seqid)

            yield (seqid, seq, qual)
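``ascii_to_phred33`` and ``ascii_to_phred64`` are helpers imported elsewhere
in the module. As a rough sketch of the conversion they perform (my own
approximation, not the library's implementation), each quality symbol maps
to its ASCII code minus the Phred offset:

import numpy as np

def ascii_to_phred(qual_line, offset):
    """Convert a line of quality symbols to integer Phred scores."""
    if isinstance(qual_line, str):
        qual_line = qual_line.encode('ascii')
    # Each byte's ASCII value, shifted down by the Phred offset.
    return np.frombuffer(qual_line, dtype=np.uint8).astype(np.int64) - offset

# '`' is ASCII 96, so with offset 64 it scores 32, matching the doctest
# output above:
# ascii_to_phred('`', 64) -> array([32])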