Пример #1
0
def dispatcher(success_fp, fail_fp, partitions):
    """Dispatch execution over a pool of processors

    A header line is written to both outputs, then every partition's IDs
    are chunked and mapped over the worker pool. Each worker result is a
    dict keyed by sample ID: an ID whose value is empty is recorded as a
    success, otherwise the ID is recorded as a failure together with its
    tab-joined details.

    Parameters
    ----------
    success_fp : file-like object
        A file-like object to write a list of successful sample IDs to
    fail_fp : file-like object
        A file-like object to write a list of unsuccessful sample IDs to,
        and any associated error information
    partitions : Iterable of (function, Iterable of str)
        Yields a function and an iterable of IDs. It is expected that the
        functions yielded will have the following signature:

        {str: list} <- function(list of str)

    Notes
    -----
    The worker pool is always shut down before this function returns: it
    is closed after a normal run and terminated if an exception
    propagates, so worker processes are not left behind.
    """
    if ag.is_test_env():
        logger = mp.log_to_stderr()
        logger.setLevel(logging.INFO)

    pool = mp.Pool(processes=agenv.get_cpu_count())
    try:
        success_fp.write('%s\n' % '#SampleID')
        fail_fp.write('%s\t%s\n' % ('#SampleID', 'Error(s)'))

        for func, ids in partitions:
            functor = partial(run_functor, func)
            chunks = list(agru.chunk_list(ids))
            for success_details in pool.map(functor, chunks):
                for id_, detail in success_details.items():
                    if detail:
                        fail_fp.write("%s\t%s\n" % (id_, '\t'.join(detail)))
                    else:
                        success_fp.write("%s\n" % id_)
    except BaseException:
        # outstanding chunks are of no use once we are failing; stop the
        # workers right away rather than waiting for them to drain
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # join() is only valid after close() or terminate(), both of which
        # are guaranteed to have happened by this point
        pool.join()
Пример #2
0
def fetch_study(study_accession, base_dir):
    """Fetch and dump a study

    Every sample reported by ``fetch_study_details`` for the study is
    considered. A sample is fetched only if its directory does not already
    exist under the study directory; for each fetched sample the sequences
    are written as FASTA to ``<sample>.fna`` and the XML record retrieved
    from ENA is written to ``<sample>.txt``.

    Parameters
    ----------
    study_accession : str
       Accession ID for the study
    base_dir : str
       Path of base directory to save the fetched results

    Returns
    -------
    int
        The number of samples newly fetched by this call. Always 0 in the
        test environment, where nothing is fetched.

    Notes
    -----
    If ``ag.staged_raw_data()`` is not None, the study directory is a
    symlink to the staged data; the link is only created when nothing
    exists at that path yet, so repeated calls do not fail. Otherwise the
    study directory is created as a regular directory if it is missing.
    """
    if ag.is_test_env():
        return 0

    study_dir = os.path.join(base_dir, study_accession)

    staged = ag.staged_raw_data()
    if staged is not None:
        # lexists (rather than exists) so that an existing link is detected
        # even when its target is currently unavailable
        if not os.path.lexists(study_dir):
            os.symlink(staged, study_dir)
    elif not os.path.exists(study_dir):
        os.mkdir(study_dir)

    url_fmt = ("http://www.ebi.ac.uk/ena/data/view/"
               "%(accession)s&display=xml")
    new_samples = 0

    for sample, fastq_url in fetch_study_details(study_accession):
        sample_dir = os.path.join(study_dir, sample)
        if os.path.exists(sample_dir):
            # an existing sample directory is treated as already fetched
            continue

        os.mkdir(sample_dir)
        metadata_path = os.path.join(sample_dir, '%s.txt' % sample)
        fasta_path = os.path.join(sample_dir, '%s.fna' % sample)

        # write out fasta; quality scores are dropped
        with open(fasta_path, 'w') as fasta_out:
            for id_, seq, _qual in parse_fastq(fetch_seqs_fastq(fastq_url)):
                fasta_out.write(">%s\n%s\n" % (id_, seq))

        # write mapping xml
        res = fetch_url(url_fmt % {'accession': sample})
        with open(metadata_path, 'w') as md_f:
            md_f.write(res.read())

        new_samples += 1
    return new_samples
Пример #3
0
def fetch_study(study_accession, base_dir):
    """Fetch and dump a study

    Every sample reported by ``fetch_study_details`` for the study is
    considered. A sample is fetched only if its directory does not already
    exist under the study directory; for each fetched sample the sequences
    are written as FASTA to ``<sample>.fna`` and the XML record retrieved
    from ENA is written to ``<sample>.txt``.

    Parameters
    ----------
    study_accession : str
       Accession ID for the study
    base_dir : str
       Path of base directory to save the fetched results

    Returns
    -------
    int
        The number of samples newly fetched by this call. Always 0 in the
        test environment, where nothing is fetched.

    Notes
    -----
    If ``ag.staged_raw_data()`` is not None, the study directory is a
    symlink to the staged data; the link is only created when nothing
    exists at that path yet, so repeated calls do not fail. Otherwise the
    study directory is created as a regular directory if it is missing.
    """
    if ag.is_test_env():
        return 0

    study_dir = os.path.join(base_dir, study_accession)

    staged = ag.staged_raw_data()
    if staged is not None:
        # lexists (rather than exists) so that an existing link is detected
        # even when its target is currently unavailable
        if not os.path.lexists(study_dir):
            os.symlink(staged, study_dir)
    elif not os.path.exists(study_dir):
        os.mkdir(study_dir)

    url_fmt = ("http://www.ebi.ac.uk/ena/data/view/"
               "%(accession)s&display=xml")
    new_samples = 0

    for sample, fastq_url in fetch_study_details(study_accession):
        sample_dir = os.path.join(study_dir, sample)
        if os.path.exists(sample_dir):
            # an existing sample directory is treated as already fetched
            continue

        os.mkdir(sample_dir)
        metadata_path = os.path.join(sample_dir, '%s.txt' % sample)
        fasta_path = os.path.join(sample_dir, '%s.fna' % sample)

        # write out fasta; quality scores are dropped
        with open(fasta_path, 'w') as fasta_out:
            for id_, seq, _qual in parse_fastq(fetch_seqs_fastq(fastq_url)):
                fasta_out.write(">%s\n%s\n" % (id_, seq))

        # write mapping xml
        res = fetch_url(url_fmt % {'accession': sample})
        with open(metadata_path, 'w') as md_f:
            md_f.write(res.read())

        new_samples += 1
    return new_samples
def get_reference_set():
    """Locate the reference sequences and taxonomy used for OTU picking

    In the test environment the small reference files shipped under the
    repository's ``tests/data`` directory are used; otherwise the paths
    are obtained from ``qdr``.

    Returns
    -------
    str
        The file path to the reference sequences.
    str
        The file path to the reference taxonomy.
    """
    if not ag.is_test_env():
        return qdr.get_reference_sequences(), qdr.get_reference_taxonomy()

    repo_root = get_repository_dir()
    return (os.path.join(repo_root, 'tests/data/otus.fna'),
            os.path.join(repo_root, 'tests/data/otus.txt'))
Пример #5
0
def get_reference_set():
    """Locate the reference sequences and taxonomy used for OTU picking

    In the test environment the small reference files shipped under the
    repository's ``tests/data`` directory are used; otherwise the paths
    are obtained from ``qdr``.

    Returns
    -------
    str
        The file path to the reference sequences.
    str
        The file path to the reference taxonomy.
    """
    if not ag.is_test_env():
        return qdr.get_reference_sequences(), qdr.get_reference_taxonomy()

    repo_root = get_repository_dir()
    return (os.path.join(repo_root, 'tests/data/otus.fna'),
            os.path.join(repo_root, 'tests/data/otus.txt'))
def get_study_accessions():
    """Return the study accessions to process

    Returns
    -------
    list of str
        A fresh copy of the accession list, so callers may modify it
        freely. The accessions are expected to be basenames for the actual
        data; e.g., the accession "foo" would have sequences as "foo.fna"
        and metadata as "foo.txt".

    Notes
    -----
    When ``ag.is_test_env()`` is true, the test accessions are staged
    first and the test accession list is returned instead of the EBI one.
    """
    if not ag.is_test_env():
        return list(_EBI_ACCESSIONS)

    _stage_test_accessions()
    return list(_TEST_ACCESSIONS)
Пример #7
0
def get_study_accessions():
    """Return the study accessions to process

    Returns
    -------
    list of str
        A fresh copy of the accession list, so callers may modify it
        freely. The accessions are expected to be basenames for the actual
        data; e.g., the accession "foo" would have sequences as "foo.fna"
        and metadata as "foo.txt".

    Notes
    -----
    When ``ag.is_test_env()`` is true, the test accessions are staged
    first and the test accession list is returned instead of the EBI one.
    """
    if not ag.is_test_env():
        return list(_EBI_ACCESSIONS)

    _stage_test_accessions()
    return list(_TEST_ACCESSIONS)
def _get_data(data_dir, tag):
    """Get a non-AG table and mapping file

    Parameters
    ----------
    data_dir : str
        The base data path
    tag : str
        The filetag (e.g., HMPv35_100nt)

    Returns
    -------
    (str, str)
        The filepath to the table, and the filepath to the mapping file.

    Notes
    -----
    When ``ag.is_test_env()`` is true, the files are looked up under the
    repository's ``tests/data`` directory instead of ``data``.

    Raises
    ------
    IOError
        If either filepath is not accessible; the message names the path
        that could not be found.
    """
    repo = get_repository_dir()
    data = 'tests/data' if ag.is_test_env() else 'data'
    base = os.path.join(repo, data)

    table = os.path.join(base, data_dir, '%s.biom' % tag)
    mapping = os.path.join(base, data_dir, '%s.txt' % tag)

    # check the table first, then the mapping file, reporting whichever
    # path is actually missing
    for path in (table, mapping):
        if not os.path.exists(path):
            raise IOError("Unable to access: %s" % path)

    return table, mapping
Пример #9
0
def _get_data(data_dir, tag):
    """Get a non-AG table and mapping file

    Parameters
    ----------
    data_dir : str
        The base data path
    tag : str
        The filetag (e.g., HMPv35_100nt)

    Returns
    -------
    (str, str)
        The filepath to the table, and the filepath to the mapping file.

    Notes
    -----
    When ``ag.is_test_env()`` is true, the files are looked up under the
    repository's ``tests/data`` directory instead of ``data``.

    Raises
    ------
    IOError
        If either filepath is not accessible; the message names the path
        that could not be found.
    """
    repo = get_repository_dir()
    data = 'tests/data' if ag.is_test_env() else 'data'
    base = os.path.join(repo, data)

    table = os.path.join(base, data_dir, '%s.biom' % tag)
    mapping = os.path.join(base, data_dir, '%s.txt' % tag)

    # check the table first, then the mapping file, reporting whichever
    # path is actually missing
    for path in (table, mapping):
        if not os.path.exists(path):
            raise IOError("Unable to access: %s" % path)

    return table, mapping
def get_rarefaction_depth():
    """Return the rarefaction depth to use, as a string

    The depth is "100" in the test environment and "1000" otherwise.
    """
    return "100" if ag.is_test_env() else "1000"
Пример #11
0
def get_rarefaction_depth():
    """Return the rarefaction depth to use, as a string

    The depth is "100" in the test environment and "1000" otherwise.
    """
    return "100" if ag.is_test_env() else "1000"