import os
import subprocess
import tarfile

import joblib
import skbio
import sklearn
from sklearn.pipeline import Pipeline

import qiime2.plugin.model as model

# The format classes used below (QIIME1DemuxFormat, NinjaOpsDBDirFmt,
# BIOMV100Format, PickleFormat, TaxonomicClassiferTemporaryPickleDirFmt) are
# assumed to be imported from q2-types and the defining plugins' own modules.


def cluster_closed_reference(sequences: QIIME1DemuxFormat,
                             reference_database: NinjaOpsDBDirFmt) \
        -> (BIOMV100Format, QIIME1DemuxFormat):
    # Input paths supplied to ninja.py.
    sequences_fp = str(sequences)
    reference_database_dir = os.path.join(str(reference_database), 'db')

    # Output directory to store ninja.py results.
    output_dirfmt = model.DirectoryFormat()
    output_dir = str(output_dirfmt)

    cmd = [
        'ninja.py',
        '--input', sequences_fp,
        '--database', reference_database_dir,
        '--output', output_dir,
        '--full_output'
    ]
    run_command(cmd)

    biom_fp = os.path.join(output_dir, 'ninja_otutable.biom')
    output_biom_fmt = BIOMV100Format(biom_fp, mode='r')

    # Keep a reference to the DirectoryFormat this BIOM file resides in so
    # that the directory isn't deleted when `output_dirfmt` goes out of scope
    # upon function exit. The directory will be cleaned up appropriately when
    # `output_biom_fmt` is cleaned up, and this avoids copying the BIOM file.
    output_biom_fmt.__dirfmt = output_dirfmt

    # Collect the set of sequence IDs that failed to hit the reference
    # database.
    failed_ids = set()
    failed_ids_fp = os.path.join(output_dir, 'ninja_fail.txt')
    with open(failed_ids_fp, 'r') as fh:
        for line in fh:
            failed_ids.add(line.rstrip('\n'))

    # Filter the input sequences down to only those that failed to hit the
    # reference database.
    output_failures_fmt = QIIME1DemuxFormat()
    with output_failures_fmt.open() as fh:
        for seq in skbio.io.read(sequences_fp, format='fasta'):
            if seq.metadata['id'] in failed_ids:
                # Turn off roundtripping options to speed up writing. These
                # options can safely be turned off because the sequence IDs
                # are roundtrip-safe: they were just read from a FASTA file.
                #
                # http://scikit-bio.org/docs/latest/generated/
                # skbio.io.format.fasta.html#writer-specific-parameters
                seq.write(fh, id_whitespace_replacement=None,
                          description_newline_replacement=None)

    return output_biom_fmt, output_failures_fmt
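# `run_command` is a plugin-local helper that is not defined in this section.
# A minimal sketch of what it is assumed to do: echo the command for the
# user's benefit, then shell out, raising an error on a non-zero exit status.
def run_command(cmd):
    print('Running external command line application. This may print'
          ' messages to stdout and/or stderr.')
    print('Command: %s' % ' '.join(cmd))
    subprocess.run(cmd, check=True)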
# Transformer: serialize a scikit-learn Pipeline into the temporary pickle
# directory format. (In QIIME 2 plugins, numbered functions like this are
# typically registered with @plugin.register_transformer.)
def _2(data: Pipeline) -> TaxonomicClassiferTemporaryPickleDirFmt:
    sklearn_pipeline = PickleFormat()
    with tarfile.open(str(sklearn_pipeline), 'w') as tar:
        tmpdir = model.DirectoryFormat()
        pf = os.path.join(str(tmpdir), 'sklearn_pipeline.pkl')
        # joblib.dump returns the names of all files it wrote; add each to
        # the tar archive, then remove the original.
        for fn in joblib.dump(data, pf):
            tar.add(fn, os.path.basename(fn))
            os.unlink(fn)
    dirfmt = TaxonomicClassiferTemporaryPickleDirFmt()
    dirfmt.version_info.write_data({'sklearn-version': sklearn.__version__},
                                   dict)
    dirfmt.sklearn_pipeline.write_data(sklearn_pipeline, PickleFormat)
    return dirfmt
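# The tar loop above relies on joblib.dump returning the list of file names
# it wrote: historically the main pickle plus auxiliary .npy files for large
# numpy arrays, and with current joblib typically a single file. For example
# (illustrative session; the file name is hypothetical):
#
#     >>> import joblib
#     >>> joblib.dump({'weights': [0.0, 1.0]}, '/tmp/example.pkl')
#     ['/tmp/example.pkl']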
# Transformer: deserialize the temporary pickle directory format back into a
# scikit-learn Pipeline, refusing to load pickles written by a mismatched
# scikit-learn version.
def _1(dirfmt: TaxonomicClassiferTemporaryPickleDirFmt) -> Pipeline:
    sklearn_version = dirfmt.version_info.view(dict)['sklearn-version']
    if sklearn_version != sklearn.__version__:
        raise ValueError('The scikit-learn version (%s) used to generate this'
                         ' artifact does not match the current version of'
                         ' scikit-learn installed (%s). Please retrain your'
                         ' classifier for your current deployment to prevent'
                         ' data-corruption errors.'
                         % (sklearn_version, sklearn.__version__))

    sklearn_pipeline = dirfmt.sklearn_pipeline.view(PickleFormat)
    with tarfile.open(str(sklearn_pipeline)) as tar:
        tmpdir = model.DirectoryFormat()
        dirname = str(tmpdir)
        tar.extractall(dirname)
        pipeline = joblib.load(os.path.join(dirname, 'sklearn_pipeline.pkl'))
        # Clean up the extracted files once the pipeline is in memory.
        for fn in tar.getnames():
            os.unlink(os.path.join(dirname, fn))
    return pipeline
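# A minimal round-trip sketch of the two transformers above. The toy pipeline
# and training data are hypothetical stand-ins for a real taxonomic
# classifier, and the format classes are assumed to be importable.
if __name__ == '__main__':
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.naive_bayes import MultinomialNB

    # alternate_sign=False keeps the hashed features non-negative, which
    # MultinomialNB requires; analyzer='char' treats each base as a token.
    toy = Pipeline([('vectorizer', HashingVectorizer(alternate_sign=False,
                                                     analyzer='char')),
                    ('classifier', MultinomialNB())])
    toy.fit(['ACGTACGT', 'TTGATTGA'], ['taxon-a', 'taxon-b'])

    dirfmt = _2(toy)           # Pipeline -> temporary pickle directory format
    roundtripped = _1(dirfmt)  # directory format -> Pipeline
    print(roundtripped.predict(['ACGTACGT']))  # expected: ['taxon-a']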