예제 #1
0
def to_ascii_file(demux_fp, output_fp, samples=None, out_format='fastq'):
    """Dump the sequences of a demux file into one FASTQ or FASTA file

    Parameters
    ----------
    demux_fp : str
        Path to the demux file to read from
    output_fp : str
        Path of the file to write; it is opened in binary write mode
    samples : list of str, optional
        The samples to write. When None (the default), every key found in
        the demux file is used.
    out_format : {'fastq', 'fasta'}, optional
        Format of the records written to `output_fp`. Default: 'fastq'

    Raises
    ------
    ValueError
        If `out_format` is not 'fastq' or 'fasta'
    """
    # Reject unknown formats up front, before any file is touched
    if out_format not in ('fastq', 'fasta'):
        raise ValueError("'out_format' should be either 'fastq' or 'fasta', "
                         "found: %s" % out_format)
    record_formatter = (format_fastq_record if out_format == 'fastq'
                        else format_fasta_record)

    with open_file(demux_fp, 'r') as demux:
        sample_ids = list(demux.keys()) if samples is None else samples
        # The sample ids are handed to `_to_ascii` encoded as bytes
        encoded_ids = [sample_id.encode() for sample_id in sample_ids]
        with open(output_fp, 'wb') as out:
            for record in _to_ascii(demux, encoded_ids, record_formatter):
                out.write(record)
예제 #2
0
def stats(demux):
    """Collect the summary statistics stored as attributes of a demux file

    Parameters
    ----------
    demux : {str, h5py.File, h5py.Group}
        The file or group to get stats from

    Returns
    -------
    stat
        A `stat` built from the 'n', 'max', 'min', 'std', 'mean', 'median',
        'hist' and 'hist_edge' attributes
    """
    field_names = ('n', 'max', 'min', 'std', 'mean', 'median', 'hist',
                   'hist_edge')
    with open_file(demux) as handle:
        stored = handle.attrs
        # Every attribute is read while the file is still open
        summary = stat(**{name: stored[name] for name in field_names})

    return summary
예제 #3
0
 def test_filehandle(self):
     """An already open file handle is handed back as-is and left open"""
     with tempfile.TemporaryFile('r') as original:
         with open_file(original) as yielded:
             self.assertIs(original, yielded)
         # Leaving the `open_file` context must not close our handle
         self.assertFalse(original.closed)
예제 #4
0
 def test_file_closed(self):
     """A file that `open_file` opened from a path is closed on exit

     The temporary file only provides a path that exists on disk. It is
     held in a context manager so its own handle is released when the test
     finishes instead of being left for the garbage collector.
     """
     with tempfile.NamedTemporaryFile('r') as tmp:
         with open_file(tmp.name) as fh:
             pass
         self.assertTrue(fh.closed)
예제 #5
0
    def test_hdf5IO_open(self):
        """A path to an HDF5 file is opened by `open_file` as an h5py.File"""
        # Only the path is needed: reserve it and release the handle before
        # h5py creates the file at that location
        with tempfile.NamedTemporaryFile(delete=False) as fh:
            name = fh.name

        try:
            # Turn the empty placeholder into a valid (empty) HDF5 file
            h5py.File(name, 'w').close()

            with open_file(name) as fh_inner:
                self.assertIsInstance(fh_inner, h5py.File)
        finally:
            # The file was created with delete=False, so remove it here even
            # when the assertion above fails
            os.remove(name)
예제 #6
0
 def test_file_closed_harder(self):
     """File gets closed by `open_file` even if the body raises

     The temporary file only provides an existing path; it is held in a
     context manager so its own handle is released when the test ends.
     """
     with tempfile.NamedTemporaryFile('r') as tmp:
         try:
             with open_file(tmp.name) as fh:
                 raise TypeError
         except TypeError:
             self.assertTrue(fh.closed)
         else:
             # If we're here, no exception escaped the `with` block, so
             # the context manager swallowed it. No good.
             raise Exception("`open_file` didn't propagate exceptions")
예제 #7
0
def to_per_sample_files(demux_fp,
                        samples=None,
                        out_dir='./',
                        n_jobs=1,
                        out_format='fastq'):
    """Write one FASTQ or FASTA file per sample

    Parameters
    ----------
    demux_fp : str
        The demux file path
    samples : list of str, optional
        Samples to write. When None (the default), every key found in the
        demux file is used.
    out_dir : str, optional
        Directory in which the per sample files are written. Defaults to
        the current directory; None is treated the same way.
    n_jobs : int, optional
        Number of jobs to run in parallel. Defaults to 1
    out_format : {'fastq', 'fasta'}
        The format of the output files. FASTQ files are named
        '<sample>.fastq' and FASTA files '<sample>.fna'.

    Raises
    ------
    ValueError
        If `out_format` is not 'fastq' or 'fasta'
    """
    # Reject unknown formats before anything is read or written
    if out_format not in ('fastq', 'fasta'):
        raise ValueError("'out_format' should be either 'fastq' or 'fasta', "
                         "found: %s" % out_format)
    if out_format == 'fastq':
        record_formatter, name_template = format_fastq_record, "%s.fastq"
    else:
        record_formatter, name_template = format_fasta_record, "%s.fna"

    if samples is None:
        with open_file(demux_fp, 'r') as demux:
            # Materialize the keys: the view is tied to the file, which is
            # closed as soon as this context manager exits
            samples = list(demux.keys())

    target_dir = './' if out_dir is None else out_dir

    # Pair every (encoded) sample id with the path of its output file
    targets = [(sample_id.encode(),
                os.path.join(target_dir, name_template % sample_id))
               for sample_id in samples]

    with joblib.Parallel(n_jobs=n_jobs) as pool:
        pool(joblib.delayed(_to_file)(demux_fp, encoded_id, sample_fp,
                                      record_formatter)
             for encoded_id, sample_fp in targets)
예제 #8
0
파일: fasta.py 프로젝트: wasade/qiita-files
    def parser(lines):
        """Yield records, each one a list of consecutive lines

        `lines` is handed to `open_file`. Every line is decoded from UTF-8
        when it supports `decode`, passed through `constructor` (if one was
        given), and dropped when `ignore` says so. A line for which
        `is_label_line` is true starts a new record. `constructor`,
        `ignore` and `is_label_line` come from the enclosing scope.
        """
        with open_file(lines) as handle:
            record = []
            for raw in handle:
                try:
                    raw = str(raw.decode('utf-8'))
                except AttributeError:
                    # No `decode` attribute (e.g. already str): use as is
                    pass

                item = raw if constructor is None else constructor(raw)
                if ignore(item):
                    continue
                # A label line closes the record collected so far
                if is_label_line(item) and record:
                    yield record
                    record = []
                record.append(item)
            # The last record has no label line after it to flush it
            if record:
                yield record
예제 #9
0
def _validate_demux_file(qclient,
                         job_id,
                         prep_info,
                         out_dir,
                         demux_fp,
                         fastq_fp=None,
                         fasta_fp=None,
                         log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list of ArtifactInfo or None, str
        Whether the validation succeeded, the artifact information (None
        on failure) and the error message (empty string on success)
    """
    pt_sample_ids = set(prep_info)
    with open_file(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        # this if is important so we don't regenerate the demux file if the
        # user uploads fastq or fna
        if demux_fp != new_demux_fp:
            copy(demux_fp, new_demux_fp)
            demux_fp = new_demux_fp

        with open_file(demux_fp, 'r+') as f:
            # Rename from a snapshot of the names: moving members while
            # iterating the live group changes the collection underneath
            # the iterator
            for old in list(f):
                new = id_map[old]
                # An id that is already correct needs no rename
                if new != old:
                    f.move(old, new)
        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        to_ascii_file(demux_fp, fastq_fp, out_format='fastq')
        fastq_fp, error_msg = _gzip_file(fastq_fp)
        if error_msg is not None:
            return False, None, error_msg

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        to_ascii_file(demux_fp, fasta_fp, out_format='fasta')
        fasta_fp, error_msg = _gzip_file(fasta_fp)
        if error_msg is not None:
            return False, None, error_msg

    filepaths = [(fastq_fp, 'preprocessed_fastq'),
                 (fasta_fp, 'preprocessed_fasta'),
                 (demux_fp, 'preprocessed_demux')]
    if log_fp:
        filepaths.append((log_fp, 'log'))
    return True, [ArtifactInfo(None, 'Demultiplexed', filepaths)], ""
예제 #10
0
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")

    supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq',
                          'preprocessed_demux', 'log'}
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types),
                        ', '.join(sorted(supported_fp_types))))
        return False, None, error_msg

    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = ["%s (%d): %s"
                  % (fp_t, len(files[fp_t]), ', '.join(files[fp_t]))
                  for fp_t in sorted(offending)]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg

    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with open_file(demux, "w") as f:
            to_hdf5(fastq, f)
        # Validate the demux, providing the original fastq
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fastq_fp=fastq,
            log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with open_file(demux, "w") as f:
            to_hdf5(fasta, f)
        # Validate the demux, providing the original fasta
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fasta_fp=fasta,
            log_fp=log)
    else:
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg

    return success, a_info, error_msg
예제 #11
0
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list of ArtifactInfo or None, str
        Whether the validation succeeded, the artifact information (None
        on failure) and the error message (empty string on success)
    """
    pt_sample_ids = set(prep_info)
    with open_file(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        # Copying a file onto itself fails, so only copy when the demux file
        # does not already live in the output directory
        if demux_fp != new_demux_fp:
            copy(demux_fp, new_demux_fp)
        # NOTE: errors raised while renaming are not caught here
        with open_file(new_demux_fp, 'r+') as f:
            # Rename from a snapshot of the names: moving members while
            # iterating the live group changes the collection underneath
            # the iterator
            for old in list(f):
                new = id_map[old]
                # An id that is already correct needs no rename
                if new != old:
                    f.move(old, new)

        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        to_ascii_file(demux_fp, fastq_fp, out_format='fastq')

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        to_ascii_file(demux_fp, fasta_fp, out_format='fasta')

    filepaths = [(fastq_fp, 'preprocessed_fastq'),
                 (fasta_fp, 'preprocessed_fasta'),
                 (demux_fp, 'preprocessed_demux')]
    if log_fp:
        filepaths.append((log_fp, 'log'))
    return True, [ArtifactInfo(None, 'Demultiplexed', filepaths)], ""
예제 #12
0
def _to_file(demux_fp, sample, fp, formatter):
    """Write the records of a single sample to their own file

    Parameters
    ----------
    demux_fp : str
        The demux file path, opened read-only
    sample : object
        The sample id, passed to `_to_ascii` in a one-element list
    fp : str
        Path of the output file; it is opened in binary write mode
    formatter : callable
        The record formatter handed to `_to_ascii`
    """
    with open_file(demux_fp, 'r') as demux, open(fp, 'wb') as out:
        out.writelines(_to_ascii(demux, [sample], formatter))
예제 #13
0
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""Yield label, seq, and qual for every record of a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If ``True``, a ``ValueError`` is raised when
        the seq and qual labels of a record don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception will be raised if a
        quality score outside the range [0, 62] is detected
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to integers

    Yields
    ------
    label, seq, qual
        The label with its id marker dropped, the sequence (decoded from
        UTF-8 when it was read as bytes) and the quality scores produced by
        the Phred converter, for each record

    Raises
    ------
    ValueError
        If `phred_offset` is not 33 or 64, if the file ends in the middle
        of a record, if `strict` is set and the labels of a record differ,
        or if `enforce_qual_range` is set and a score is outside [0, 62]
    """

    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as fh:
        # Four references to the same iterator: every step of zip_longest
        # consumes the four consecutive lines of one record
        iters = [iter(fh)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()
            # If the file simply ended in a blankline, do not error. The
            # truthiness test covers both bytes and str lines.
            if not seqid:
                continue
            # Error if an incomplete record is found
            # Note: seqid cannot be None, because if all 4 values were None,
            # then the loop condition would be false, and we could not have
            # gotten to this point
            if seq is None or qualid is None or qual is None:
                raise ValueError("Incomplete FASTQ record found at end "
                                 "of file")

            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()

            seqid = _drop_id_marker(seqid)

            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                # No `decode` attribute (e.g. already str): use as is
                pass

            qualid = _drop_id_marker(qualid)
            if strict:
                if seqid != qualid:
                    raise ValueError('ID mismatch: {} != {}'.format(
                        seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = phred_f(qual)
            if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                raise ValueError("Failed qual conversion for seq id: %s. "
                                 "This may be because you passed an incorrect "
                                 "value for phred_offset." % seqid)

            yield (seqid, seq, qual)
예제 #14
0
 def test_hdf5IO(self):
     """An open h5py.File is handed back unchanged by `open_file`"""
     # The file uses the core driver without a backing store; the context
     # manager closes it once the test is done instead of leaving it open
     with h5py.File('test', mode='w', driver='core',
                    backing_store=False) as f:
         with open_file(f) as fh:
             self.assertTrue(fh is f)
예제 #15
0
 def test_BytesIO(self):
     """A BytesIO stream (handy e.g. in tests) is handed back unchanged"""
     stream = BytesIO(b"File contents")
     with open_file(stream) as yielded:
         self.assertIs(yielded, stream)
예제 #16
0
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")

    supported_fp_types = {
        'preprocessed_fasta', 'preprocessed_fastq', 'preprocessed_demux', 'log'
    }
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s" %
                     (', '.join(unsupported_fp_types), ', '.join(
                         sorted(supported_fp_types))))
        return False, None, error_msg

    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = [
            "%s (%d): %s" % (fp_t, len(files[fp_t]), ', '.join(files[fp_t]))
            for fp_t in sorted(offending)
        ]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg

    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        success, a_info, error_msg = _validate_demux_file(qclient,
                                                          job_id,
                                                          prep_info,
                                                          out_dir,
                                                          demux,
                                                          log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with open_file(demux, "w") as f:
            to_hdf5(fastq, f)
        # Validate the demux, providing the original fastq
        success, a_info, error_msg = _validate_demux_file(qclient,
                                                          job_id,
                                                          prep_info,
                                                          out_dir,
                                                          demux,
                                                          fastq_fp=fastq,
                                                          log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with open_file(demux, "w") as f:
            to_hdf5(fasta, f)
        # Validate the demux, providing the original fasta
        success, a_info, error_msg = _validate_demux_file(qclient,
                                                          job_id,
                                                          prep_info,
                                                          out_dir,
                                                          demux,
                                                          fasta_fp=fasta,
                                                          log_fp=log)
    else:
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg

    return success, a_info, error_msg