def test_tsv_builder(self):
    seqs = DNAIterator(skbio.DNA(a, metadata=b)
                       for a, b in (('A', {'id': 'seq01'}),
                                    ('AA', {'id': 'seq02'}),
                                    ('AAA', {'id': 'seq03'}),
                                    ('AAAA', {'id': 'seq04'}),
                                    ('AAAA', {'id': 'seq05'}),
                                    ('AAA', {'id': 'seq06'}),
                                    ('AA', {'id': 'seq07'}),
                                    ('AAAAAAAAAA', {'id': 'seq08'})))

    with tempfile.TemporaryDirectory() as output_dir:
        tabulate_seqs(output_dir, seqs)

        # Do the files exist?
        expected_stats_fp = os.path.join(output_dir,
                                         'descriptive_stats.tsv')
        expected_summary_fp = os.path.join(output_dir,
                                           'seven_number_summary.tsv')
        self.assertTrue(os.path.exists(expected_stats_fp))
        self.assertTrue(os.path.exists(expected_summary_fp))

        # Was data written to the files?
        with open(expected_stats_fp) as stats_tsv:
            tsv_reader = csv.reader(stats_tsv, dialect="excel-tab")
            tsv_text = list(tsv_reader)
            self.assertEqual(['Statistic', 'Value'], tsv_text[0])
            self.assertEqual(['count', '8'], tsv_text[1])

        with open(expected_summary_fp) as summ_tsv:
            tsv_reader = csv.reader(summ_tsv, dialect="excel-tab")
            tsv_text = list(tsv_reader)
            self.assertEqual(['Quantile', 'Value'], tsv_text[0])
            self.assertEqual(['0.02', '1.14'], tsv_text[1])

        # Does the link HTML generate correctly?
        expected_index_fp = os.path.join(output_dir, 'index.html')
        with open(expected_index_fp) as fh:
            index_html = fh.read()
        self.assertTrue('href="descriptive_stats.tsv"' in index_html)
        self.assertTrue('href="seven_number_summary.tsv"' in index_html)
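
# An isolated, runnable sketch of the "excel-tab" TSV parsing pattern the
# assertions above rely on. The buffer contents are toy data, not the real
# visualizer output.
import csv
import io

buf = io.StringIO('Statistic\tValue\ncount\t8\n')
rows = list(csv.reader(buf, dialect='excel-tab'))
print(rows)  # [['Statistic', 'Value'], ['count', '8']]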
def test_basic(self):
    seqs = DNAIterator(skbio.DNA(a, metadata=b)
                       for a, b in (('ACGT', {'id': 'seq1'}),
                                    ('AAAA', {'id': 'seq2'})))

    with tempfile.TemporaryDirectory() as output_dir:
        tabulate_seqs(output_dir, seqs)

        expected_fp = os.path.join(output_dir, 'index.html')
        self.assertTrue(os.path.exists(expected_fp))

        with open(expected_fp) as fh:
            file_text = fh.read()
        self.assertTrue('ACGT</a>' in file_text)
        self.assertTrue('<td>4</td>' in file_text)
        self.assertTrue('<td>seq2</td>' in file_text)
def cross_validate_classifier(ref_taxa, ref_seqs, classifier_spec, obs_dir,
                              results_dir, intermediate_dir, n_jobs,
                              log_file, log_level, confidence,
                              classifier_directory):
    classifier_spec = classifier_spec.read()

    # set up logging
    setup_logging(log_level, log_file)
    logging.info(locals())

    # load folds
    taxon_defaults_file = join(intermediate_dir, 'taxon_defaults.json')
    with open(taxon_defaults_file) as fh:
        taxon_defaults = json.load(fh)
    folds = glob.glob(join(intermediate_dir, 'fold-*'))
    logging.info('Got folds')

    # load the reference sequences
    _, ref_seqs = load_references(ref_taxa, ref_seqs)
    ref_seqs = Artifact.import_data('FeatureData[Sequence]',
                                    DNAIterator(ref_seqs))

    # for each fold
    for fold in folds:
        # locate the per-fold input files
        weights_file = join(fold, 'weights.qza')
        training_taxa_file = join(fold, 'train_taxa.qza')

        # load the simulated test samples
        test_samples = load_simulated_samples(fold, results_dir)

        # load the training taxa and weights
        weights = Artifact.load(weights_file)
        train_taxa = Artifact.load(training_taxa_file)

        # train the weighted classifier and classify the test samples
        classification = classify_samples_sklearn(
            test_samples, train_taxa, ref_seqs, classifier_spec,
            confidence, n_jobs, weights)

        # save the classified taxonomy artifacts
        save_observed(classifier_directory, test_samples, classification,
                      obs_dir)
        logging.info('Done ' + fold)
def extract_reads(sequences: DNAIterator, f_primer: str, r_primer: str,
                  trunc_len: int = 0, trim_left: int = 0,
                  identity: float = 0.8, min_length: int = 50,
                  max_length: int = 0) -> DNAIterator:
    """Extract the reads selected by a primer or primer pair.

    Only sequences that match the primers at greater than the specified
    identity are returned.

    Parameters
    ----------
    sequences : DNAIterator
        an aligned list of skbio.sequence.DNA query sequences
    f_primer : skbio.sequence.DNA
        forward primer sequence
    r_primer : skbio.sequence.DNA
        reverse primer sequence
    trunc_len : int, optional
        read is cut to trunc_len if trunc_len is positive.
        Applied before trim_left.
    trim_left : int, optional
        trim_left nucleotides are removed from the 5' end if trim_left
        is positive. Applied after trunc_len.
    identity : float, optional
        minimum combined primer match identity threshold. Default: 0.8
    min_length : int, optional
        minimum amplicon length. Shorter amplicons are discarded.
        Default: 50
    max_length : int, optional
        maximum amplicon length. Longer amplicons are discarded.

    Returns
    -------
    q2_types.DNAIterator
        containing the extracted reads
    """
    reads = _gen_reads(sequences, f_primer, r_primer, trunc_len, trim_left,
                       identity, min_length, max_length)
    try:
        first_read = next(reads)
    except StopIteration:
        raise RuntimeError('No matches found') from None
    return DNAIterator(chain([first_read], reads))
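
# A self-contained sketch of the empty-generator guard used above: pull the
# first item to detect "no matches" eagerly, then chain it back onto the
# stream so nothing is lost. The guarded() helper is illustrative only, not
# part of the module.
from itertools import chain


def guarded(gen):
    try:
        first = next(gen)
    except StopIteration:
        raise RuntimeError('No matches found') from None
    return chain([first], gen)


print(list(guarded(iter(['read1', 'read2']))))  # ['read1', 'read2']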
def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
                     reads_per_batch: int = 0, n_jobs: int = 1,
                     pre_dispatch: str = '2*n_jobs',
                     confidence: float = 0.7,
                     read_orientation: str = 'auto') -> pd.DataFrame:
    try:
        # autotune reads per batch
        if reads_per_batch == 0:
            reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)

        # transform reads to DNAIterator
        reads = DNAIterator(
            skbio.read(str(reads), format='fasta', constructor=skbio.DNA))
        reads = _autodetect_orientation(
            reads, classifier, read_orientation=read_orientation)

        predictions = predict(reads, classifier, chunk_size=reads_per_batch,
                              n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                              confidence=confidence)
        seq_ids, taxonomy, confidence = list(zip(*predictions))

        result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
                              index=seq_ids, columns=['Taxon', 'Confidence'])
        result.index.name = 'Feature ID'
        return result
    except MemoryError:
        raise MemoryError("The operation has run out of available memory. "
                          "To correct this error:\n"
                          "1. Reduce the reads per batch\n"
                          "2. Reduce the number of n_jobs\n"
                          "3. Use a more powerful machine or allocate "
                          "more resources")
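
# An isolated sketch of how the (id, taxon, confidence) prediction tuples
# above are assembled into the result DataFrame. The prediction values are
# toy placeholders, not real classifier output.
import pandas as pd

predictions = [('f1', 'k__Bacteria; p__Firmicutes', 0.97),
               ('f2', 'k__Bacteria', 0.71)]
seq_ids, taxonomy, confidence = list(zip(*predictions))
result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
                      index=seq_ids, columns=['Taxon', 'Confidence'])
result.index.name = 'Feature ID'
print(result)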
def test_descriptive_stats_integration(self):
    seqs = DNAIterator(skbio.DNA(a, metadata=b)
                       for a, b in (('A', {'id': 'seq01'}),
                                    ('AA', {'id': 'seq02'}),
                                    ('AAA', {'id': 'seq03'}),
                                    ('AAAA', {'id': 'seq04'}),
                                    ('AAAA', {'id': 'seq05'}),
                                    ('AAA', {'id': 'seq06'}),
                                    ('AA', {'id': 'seq07'}),
                                    ('AAAAAAAAAA', {'id': 'seq08'})))

    with tempfile.TemporaryDirectory() as output_dir:
        tabulate_seqs(output_dir, seqs)

        expected_fp = os.path.join(output_dir, 'index.html')

        # Most of the expected values are distinctive (a few repeat across
        # the stats and summary tables). If they all render in index.html,
        # the visualizer likely worked as expected.
        with open(expected_fp) as fh:
            file_text = fh.read()
        self.assertTrue('<td>8</td>' in file_text)
        self.assertTrue('<td>1</td>' in file_text)
        self.assertTrue('<td>10</td>' in file_text)
        self.assertTrue('<td>3.62</td>' in file_text)
        self.assertTrue('<td>9</td>' in file_text)
        self.assertTrue('<td>2</td>' in file_text)
        self.assertTrue('<td>3</td>' in file_text)
        self.assertTrue('<td>4</td>' in file_text)
        self.assertTrue('<td>6</td>' in file_text)
def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
                     reads_per_batch: int = 0, n_jobs: int = 1,
                     pre_dispatch: str = '2*n_jobs',
                     confidence: float = 0.7,
                     read_orientation: str = None) -> pd.DataFrame:
    # autotune reads per batch
    if reads_per_batch == 0:
        reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)

    # transform reads to DNAIterator
    reads = DNAIterator(
        skbio.read(str(reads), format='fasta', constructor=skbio.DNA))
    reads = _autodetect_orientation(
        reads, classifier, read_orientation=read_orientation)
    predictions = predict(reads, classifier, chunk_size=reads_per_batch,
                          n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                          confidence=confidence)
    seq_ids, taxonomy, confidence = list(zip(*predictions))
    result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
                          index=seq_ids, columns=['Taxon', 'Confidence'])
    result.index.name = 'Feature ID'
    return result
def _denoise_helper(
        demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
        trim_length: int,
        left_trim_len: int = 0,
        sample_stats: bool = False,
        reference_seqs: DNAFASTAFormat = None,
        mean_error: float = 0.005,
        indel_prob: float = 0.01,
        indel_max: int = 3,
        min_reads: int = 10,
        min_size: int = 2,
        jobs_to_start: int = 1,
        hashed_feature_ids: bool = True
        ) -> (biom.Table, DNAIterator, pd.DataFrame):
    _check_inputs(**locals())
    df = demultiplexed_seqs.manifest.view(pd.DataFrame)
    ids_with_underscores = df.index.astype(str).str.contains('_')
    ids_with_underscores = df[ids_with_underscores].index.tolist()
    if ids_with_underscores:
        ids_with_underscores = ', '.join(ids_with_underscores)
        raise ValueError("Deblur cannot operate on sample IDs that "
                         "contain underscores. The following ID(s) "
                         "contain one or more underscores: "
                         f"{ids_with_underscores}.")

    with tempfile.TemporaryDirectory() as tmp:
        seqs_fp = str(demultiplexed_seqs)
        cmd = ['deblur', 'workflow',
               '--seqs-fp', seqs_fp,
               '--output-dir', tmp,
               '--mean-error', str(mean_error),
               '--indel-prob', str(indel_prob),
               '--indel-max', str(indel_max),
               '--trim-length', str(trim_length),
               '--left-trim-length', str(left_trim_len),
               '--min-reads', str(min_reads),
               '--min-size', str(min_size),
               '--jobs-to-start', str(jobs_to_start),
               '-w']

        if reference_seqs is not None:
            cmd.append('--pos-ref-fp')
            cmd.append(str(reference_seqs))

        if sample_stats:
            cmd.append('--keep-tmp-files')

        subprocess.run(cmd, check=True)

        # all.seqs.fa is one of Deblur's outputs. Its utility for the
        # majority of QIIME 2 users is unclear, but it makes it very easy
        # to check whether the run completed.
        all_seqs = os.path.join(tmp, 'all.seqs.fa')
        if os.stat(all_seqs).st_size == 0:
            raise ValueError("No sequences passed the filter. It is "
                             "possible that the trim_length (%d) exceeds "
                             "the longest sequence, that all of the "
                             "sequences are artifacts like PhiX or "
                             "adapter, or that the positive reference "
                             "used is not representative of the data "
                             "being denoised." % trim_length)

        table = _load_table(tmp)

        if hashed_feature_ids:
            obs_map = _hash_ids(table)  # inplace operation
        else:
            obs_map = {i: i for i in table.ids(axis='observation')}

        rep_sequences = DNAIterator(
            (skbio.DNA(k, metadata={'id': v}, lowercase='ignore')
             for k, v in obs_map.items()))

        if sample_stats:
            stats = _gather_stats(demultiplexed_seqs, tmp)
        else:
            stats = pd.DataFrame([], columns=STATS_HEADER)
            stats.set_index('sample-id', inplace=True)

    return (table, rep_sequences, stats)
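
# An isolated, runnable sketch of the underscore validation above, using a
# plain pandas index in place of the demultiplexed-seqs manifest (sample IDs
# are hypothetical):
import pandas as pd

manifest = pd.DataFrame(index=['sampleA', 'sample_B'])
mask = manifest.index.astype(str).str.contains('_')
print(manifest[mask].index.tolist())  # ['sample_B'] -> would raise above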
def _7(data: RNAFASTAFormat) -> DNAIterator:
    iterator = _read_fasta(str(data), constructor=skbio.RNA)
    generator = _rna_to_dna_iterator(iterator)
    return DNAIterator(generator)
def _7(data: RNAFASTAFormat) -> DNAIterator:
    converted_dna = _rna_to_dna(str(data))
    generator = _read_dna_fasta(str(converted_dna))
    return DNAIterator(generator)
def sequence_variants_from_samples(samples: biom.Table) -> DNAIterator:
    seqs = (DNA(s, metadata={'id': s})
            for s in samples.ids(axis='observation'))
    return DNAIterator(seqs)
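
# A tiny runnable sketch of sequence_variants_from_samples, assuming (as the
# function does) that the table's observation IDs are the sequences
# themselves. The table contents are toy data.
import numpy as np
import biom

table = biom.Table(np.array([[1, 0], [0, 2]]),
                   observation_ids=['ACGT', 'TTGA'],
                   sample_ids=['s1', 's2'])
for seq in sequence_variants_from_samples(table):
    print(seq.metadata['id'], str(seq))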
def _denoise_helper(biom_fp, track_fp, hashed_feature_ids,
                    retain_all_samples):
    _check_featureless_table(biom_fp)
    with open(biom_fp) as fh:
        table = biom.Table.from_tsv(fh, None, None, None)
    df = pd.read_csv(track_fp, sep='\t', index_col=0)
    df.index.name = 'sample-id'
    df = df.rename(index=_filepath_to_sample)
    PASSED_FILTER = 'percentage of input passed filter'
    NON_CHIMERIC = 'percentage of input non-chimeric'
    round_cols = {PASSED_FILTER: 2, NON_CHIMERIC: 2}
    df[PASSED_FILTER] = df['filtered'] / df['input'] * 100
    df[NON_CHIMERIC] = df['non-chimeric'] / df['input'] * 100
    col_order = ['input', 'filtered', PASSED_FILTER, 'denoised',
                 'non-chimeric', NON_CHIMERIC]

    # only calculate percentage of input merged if paired-end
    if 'merged' in df:
        MERGED = 'percentage of input merged'
        round_cols[MERGED] = 2
        df[MERGED] = df['merged'] / df['input'] * 100
        col_order.insert(4, 'merged')
        col_order.insert(5, MERGED)

    df = df[col_order]
    df.fillna(0, inplace=True)
    df = df.round(round_cols)
    metadata = qiime2.Metadata(df)

    # Currently the sample IDs in DADA2 are the file names. We make
    # them the sample-id part of the filename here.
    sid_map = {id_: _filepath_to_sample(id_)
               for id_ in table.ids(axis='sample')}
    table.update_ids(sid_map, axis='sample', inplace=True)

    # Reintroduce empty samples dropped by DADA2.
    table_cols = table.ids(axis='observation')
    table_rows = list(set(df.index) - set(table.ids()))
    table_to_add = biom.Table(np.zeros((len(table_cols), len(table_rows))),
                              table_cols, table_rows, type="OTU table")
    table = table.concat(table_to_add)
    # This is necessary (instead of just not reintroducing above) because
    # DADA2 will discard samples that are empty after filtering but will
    # keep samples that are empty after merging, so there are potentially
    # samples removed here that were not reintroduced above!
    if not retain_all_samples:
        table = table.remove_empty(axis="sample", inplace=False)

    # The feature IDs in DADA2 are the sequences themselves.
    if hashed_feature_ids:
        # Make feature IDs the md5 sums of the sequences.
        fid_map = {id_: hashlib.md5(id_.encode('utf-8')).hexdigest()
                   for id_ in table.ids(axis='observation')}
        table.update_ids(fid_map, axis='observation', inplace=True)
        rep_sequences = DNAIterator(
            (skbio.DNA(k, metadata={'id': v})
             for k, v in fid_map.items()))
    else:
        rep_sequences = DNAIterator(
            (skbio.DNA(id_, metadata={'id': id_})
             for id_ in table.ids(axis='observation')))
    return table, rep_sequences, metadata
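
# A standalone sketch of the hashed feature-ID convention above: each
# feature ID becomes the md5 hexdigest of its own sequence, giving a
# stable, sequence-derived identifier.
import hashlib

seq = 'ACGTACGT'
feature_id = hashlib.md5(seq.encode('utf-8')).hexdigest()
print(feature_id)  # deterministic 32-character hex ID for this sequence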
def reverse_transcribe(rna_sequences: RNAIterator) -> DNAIterator:
    generator = _rna_to_dna_iterator(rna_sequences)
    return DNAIterator(generator)
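
# What the per-sequence conversion inside _rna_to_dna_iterator presumably
# amounts to, sketched with skbio's built-in reverse transcription (U -> T).
# The helper itself is not shown here, so this is an assumption.
import skbio

rna = skbio.RNA('AUGGCU', metadata={'id': 'rna1'})
dna = rna.reverse_transcribe()  # returns an skbio.DNA sequence
print(str(dna))  # ATGGCT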
def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
    _check_featureless_table(biom_fp)
    with open(biom_fp) as fh:
        table = biom.Table.from_tsv(fh, None, None, None)
    df = pd.read_csv(track_fp, sep='\t', index_col=0)
    df.index.name = 'sample-id'
    df = df.rename(index=_filepath_to_sample)
    PASSED_FILTER = 'percentage of input passed filter'
    NON_CHIMERIC = 'percentage of input non-chimeric'
    round_cols = {PASSED_FILTER: 2, NON_CHIMERIC: 2}
    df[PASSED_FILTER] = df['filtered'] / df['input'] * 100
    df[NON_CHIMERIC] = df['non-chimeric'] / df['input'] * 100
    col_order = ['input', 'filtered', PASSED_FILTER, 'denoised',
                 'non-chimeric', NON_CHIMERIC]

    # only calculate percentage of input merged if paired-end
    if 'merged' in df:
        MERGED = 'percentage of input merged'
        round_cols[MERGED] = 2
        df[MERGED] = df['merged'] / df['input'] * 100
        col_order.insert(4, 'merged')
        col_order.insert(5, MERGED)

    # only calculate percentage of input primer-removed if CCS
    if 'primer-removed' in df:
        PASSED_PRIMERREMOVE = 'percentage of input primer-removed'
        round_cols[PASSED_PRIMERREMOVE] = 2
        df[PASSED_PRIMERREMOVE] = df['primer-removed'] / df['input'] * 100
        col_order.insert(1, 'primer-removed')
        col_order.insert(2, PASSED_PRIMERREMOVE)

    df = df[col_order]
    df.fillna(0, inplace=True)
    df = df.round(round_cols)
    metadata = qiime2.Metadata(df)

    # Currently the sample IDs in DADA2 are the file names. We make
    # them the sample-id part of the filename here.
    sid_map = {id_: _filepath_to_sample(id_)
               for id_ in table.ids(axis='sample')}
    table.update_ids(sid_map, axis='sample', inplace=True)

    # The feature IDs in DADA2 are the sequences themselves.
    if hashed_feature_ids:
        # Make feature IDs the md5 sums of the sequences.
        fid_map = {id_: hashlib.md5(id_.encode('utf-8')).hexdigest()
                   for id_ in table.ids(axis='observation')}
        table.update_ids(fid_map, axis='observation', inplace=True)
        rep_sequences = DNAIterator(
            (skbio.DNA(k, metadata={'id': v})
             for k, v in fid_map.items()))
    else:
        rep_sequences = DNAIterator(
            (skbio.DNA(id_, metadata={'id': id_})
             for id_ in table.ids(axis='observation')))
    return table, rep_sequences, metadata
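
# An isolated sketch of the percentage columns computed above, on a toy
# tracking DataFrame (all counts are hypothetical):
import pandas as pd

track = pd.DataFrame({'input': [100, 80], 'filtered': [90, 60],
                      'non-chimeric': [85, 55]}, index=['s1', 's2'])
track['percentage of input passed filter'] = (
    track['filtered'] / track['input'] * 100)
track['percentage of input non-chimeric'] = (
    track['non-chimeric'] / track['input'] * 100)
print(track.round(2))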
def get_train_artifacts(taxonomy_samples, fold, taxon_defaults,
                        ref_taxa, ref_seqs, weights=None):
    if weights is None:
        with open(join(fold, 'sample_train.json')) as fp:
            train_samples = json.load(fp)
        train_samples = extract_sample(train_samples, taxonomy_samples)
    else:
        train_samples = weights.view(Table)
    ref_taxa, ref_seqs = load_references(ref_taxa, ref_seqs)
    with open(join(fold, 'seq_train.json')) as fp:
        train_seqs = json.load(fp)
    train_taxa = {ref_taxa[sid] for sid in train_seqs}

    # mutable single-element lists so the closure below can update them
    hits = [0]
    direct_remaps = [0]
    indirect_remaps = [0]

    def collapse(taxon, _):
        # assumes taxon_defaults always yields a candidate in train_taxa
        if taxon in train_taxa:
            hits[0] += 1
            return taxon
        if taxon_defaults[taxon][0] in train_taxa:
            direct_remaps[0] += 1
            return taxon_defaults[taxon][0]
        for try_taxon in taxon_defaults[taxon][1:]:
            if try_taxon in train_taxa:
                indirect_remaps[0] += 1
                return try_taxon

    train_samples = train_samples.collapse(collapse, axis='observation',
                                           norm=False)
    logging.info('Train taxon remaps')
    logging.info(str(hits[0]) + ' hits')
    logging.info(str(direct_remaps[0]) + ' direct remaps')
    logging.info(str(indirect_remaps[0]) + ' indirect remaps')
    train_samples = Artifact.import_data('FeatureTable[Frequency]',
                                         train_samples)

    train_taxa = list(train_taxa)
    eye_taxonomy = DataFrame({'Taxon': train_taxa}, index=train_taxa,
                             columns=['Taxon'])
    eye_taxonomy.index.name = 'Feature ID'
    eye_taxonomy = Artifact.import_data('FeatureData[Taxonomy]',
                                        eye_taxonomy)

    train_taxa = [ref_taxa[sid] for sid in train_seqs]
    train_taxonomy = DataFrame({'Taxon': train_taxa}, index=train_seqs,
                               columns=['Taxon'])
    train_taxonomy.index.name = 'Feature ID'
    train_taxonomy = Artifact.import_data('FeatureData[Taxonomy]',
                                          train_taxonomy)

    train_iter = DNAIterator(
        s for s in ref_seqs if s.metadata['id'] in train_seqs)
    train_art = Artifact.import_data('FeatureData[Sequence]', train_iter)

    unobserved_weight = 1e-6 if weights is None else 0.
    weights = clawback.methods.generate_class_weights(
        train_taxonomy, train_art, train_samples, eye_taxonomy,
        unobserved_weight=unobserved_weight)
    ref_seqs = Artifact.import_data('FeatureData[Sequence]',
                                    DNAIterator(ref_seqs))
    return train_taxonomy, train_art, ref_seqs, weights.class_weight
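
# A self-contained sketch of the observation-collapsing remap above, with a
# toy table whose observation IDs stand in for taxa and a hypothetical
# defaults map standing in for taxon_defaults (counters omitted for brevity):
import numpy as np
from biom import Table

toy = Table(np.array([[3, 0], [1, 2]]), ['t1', 't2'], ['s1', 's2'])
train_taxa = {'t1'}
defaults = {'t1': ['t1'], 't2': ['t1']}


def remap(taxon, _):
    # keep taxa seen in training; otherwise fall back to the first default
    return taxon if taxon in train_taxa else defaults[taxon][0]


collapsed = toy.collapse(remap, axis='observation', norm=False)
print(collapsed.ids(axis='observation'))  # ['t1'] -- t2 folded into t1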