Example #1
def cluster_closed_reference(sequences: QIIME1DemuxFormat,
                             reference_database: NinjaOpsDBDirFmt) \
                                     -> (BIOMV100Format, QIIME1DemuxFormat):
    # Input paths supplied to ninja.py.
    sequences_fp = str(sequences)
    reference_database_dir = os.path.join(str(reference_database), 'db')

    # Output directory to store ninja.py results.
    output_dirfmt = model.DirectoryFormat()
    output_dir = str(output_dirfmt)

    cmd = [
        'ninja.py', '--input', sequences_fp, '--database',
        reference_database_dir, '--output', output_dir, '--full_output'
    ]
    run_command(cmd)

    biom_fp = os.path.join(output_dir, 'ninja_otutable.biom')
    output_biom_fmt = BIOMV100Format(biom_fp, mode='r')
    # Keep a reference to the DirectoryFormat this BIOM file resides in so that
    # the directory isn't deleted when `output_dirfmt` goes out of scope upon
    # function exit. The directory is still cleaned up appropriately once
    # `output_biom_fmt` itself is cleaned up, and holding the reference avoids
    # copying the BIOM file out of the directory.
    output_biom_fmt.__dirfmt = output_dirfmt

    # Get the set of IDs that failed to hit the reference database.
    failed_ids = set()
    failed_ids_fp = os.path.join(output_dir, 'ninja_fail.txt')
    with open(failed_ids_fp, 'r') as fh:
        for line in fh:
            id = line.rstrip('\n')
            failed_ids.add(id)

    # Filter the input sequences to only those that failed to hit the reference
    # database.
    output_failures_fmt = QIIME1DemuxFormat()
    with output_failures_fmt.open() as fh:
        for seq in skbio.io.read(sequences_fp, format='fasta'):
            id = seq.metadata['id']
            if id in failed_ids:
                # Turning off roundtripping options to speed up writing. We can
                # safely turn these options off because we know the sequence
                # IDs are roundtrip-safe since we're reading them from a FASTA
                # file.
                #
                # http://scikit-bio.org/docs/latest/generated/
                #     skbio.io.format.fasta.html#writer-specific-parameters
                seq.write(fh,
                          id_whitespace_replacement=None,
                          description_newline_replacement=None)

    return output_biom_fmt, output_failures_fmt
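
The excerpt above calls a run_command() helper that isn't shown. A minimal sketch of such a helper, assuming it simply shells out via subprocess and raises on a non-zero exit status (the helper name and the verbose echo are assumptions, not taken from the excerpt):

import subprocess


def run_command(cmd, verbose=True):
    # Hypothetical helper, not part of the excerpt above: echo the external
    # command being run, then execute it and raise CalledProcessError if it
    # exits with a non-zero status.
    if verbose:
        print('Running external command line application. This may print'
              ' messages to stdout and/or stderr.')
        print('Command:', ' '.join(cmd))
    subprocess.run(cmd, check=True)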
Example #2
def _2(data: Pipeline) -> TaxonomicClassiferTemporaryPickleDirFmt:
    # Serialize the scikit-learn Pipeline into a single tar archive backed by
    # a PickleFormat file.
    sklearn_pipeline = PickleFormat()
    with tarfile.open(str(sklearn_pipeline), 'w') as tar:
        tmpdir = model.DirectoryFormat()
        pf = os.path.join(str(tmpdir), 'sklearn_pipeline.pkl')
        # joblib.dump may produce auxiliary files alongside the pickle; add
        # each one to the archive under its basename and remove the original.
        for fn in joblib.dump(data, pf):
            tar.add(fn, os.path.basename(fn))
            os.unlink(fn)

    # Record the scikit-learn version used for serialization so it can be
    # checked when the pipeline is loaded again.
    dirfmt = TaxonomicClassiferTemporaryPickleDirFmt()
    dirfmt.version_info.write_data({'sklearn-version': sklearn.__version__},
                                   dict)
    dirfmt.sklearn_pipeline.write_data(sklearn_pipeline, PickleFormat)

    return dirfmt
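
PickleFormat above is treated as a single on-disk file holding a tar archive. A minimal sketch of what such a format definition could look like, assuming it only needs to sniff for a valid tar file (the real definition lives in the plugin's own module and may differ):

import tarfile

import qiime2.plugin.model as model


class PickleFormat(model.BinaryFileFormat):
    # Hypothetical definition: the transformer above writes a tar archive to
    # this file, so sniffing just checks that the path is a readable tar file.
    def sniff(self):
        return tarfile.is_tarfile(str(self))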
Example #3
def _1(dirfmt: TaxonomicClassiferTemporaryPickleDirFmt) -> Pipeline:
    # Refuse to load a pipeline that was pickled under a different
    # scikit-learn version; the pickle may not be compatible.
    sklearn_version = dirfmt.version_info.view(dict)['sklearn-version']
    if sklearn_version != sklearn.__version__:
        raise ValueError('The scikit-learn version (%s) used to generate this'
                         ' artifact does not match the current version'
                         ' of scikit-learn installed (%s). Please retrain your'
                         ' classifier for your current deployment to prevent'
                         ' data-corruption errors.' %
                         (sklearn_version, sklearn.__version__))

    sklearn_pipeline = dirfmt.sklearn_pipeline.view(PickleFormat)

    # Extract the tarred joblib pickle into a temporary directory, load the
    # Pipeline from it, then remove the extracted files.
    with tarfile.open(str(sklearn_pipeline)) as tar:
        tmpdir = model.DirectoryFormat()
        dirname = str(tmpdir)
        tar.extractall(dirname)
        pipeline = joblib.load(os.path.join(dirname, 'sklearn_pipeline.pkl'))
        for fn in tar.getnames():
            os.unlink(os.path.join(dirname, fn))

    return pipeline
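
Taken together, _2 and _1 round-trip a scikit-learn Pipeline through the temporary pickle directory format. A hedged usage sketch, assuming both transformers and their dependencies are importable (in normal QIIME 2 use the framework invokes these transformers itself rather than calling them directly):

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Build a small (unfitted) pipeline, write it out with the writer transformer
# (_2), and read it back with the reader transformer (_1).
pipeline = Pipeline([('vectorize', HashingVectorizer()),
                     ('classify', MultinomialNB())])
dirfmt = _2(pipeline)
restored = _1(dirfmt)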