def test_dna_fasta_format_empty_file(self):
    """A file containing only a newline validates without raising."""
    empty_fp = os.path.join(self.temp_dir.name, 'empty')
    with open(empty_fp, 'w') as handle:
        handle.write('\n')

    fmt = DNAFASTAFormat(empty_fp, mode='r')
    fmt.validate()
def _load_DNAFASTAFormat(reads_fn):
    """Open the named test data file as a read-only ``DNAFASTAFormat``."""
    # NOTE(review): ``self`` is not a parameter here; it has to come from an
    # enclosing scope that is not visible in this excerpt - confirm.
    return DNAFASTAFormat(self.get_data_path(reads_fn), mode='r')
def setUp(self):
    """Expose the ``cleanseq-test-1.fasta`` fixture as ``self.seqs1``."""
    super().setUp()
    fasta = DNAFASTAFormat(self.get_data_path('cleanseq-test-1.fasta'),
                           mode='r')
    # Tests consume the sequences through the DNAIterator view.
    self.seqs1 = fasta.view(DNAIterator)
def test_dna_fasta_format_no_id(self):
    """Validation fails with a 'missing an ID' message for the fixture."""
    fmt = DNAFASTAFormat(self.get_data_path('dna-sequences-no-id.fasta'),
                         mode='r')

    with self.assertRaisesRegex(ValidationError, '1.*missing an ID'):
        fmt.validate()
def test_dna_fasta_format_duplicate_ids(self):
    """Validation fails with a 'duplicate' message for the fixture."""
    fmt = DNAFASTAFormat(
        self.get_data_path('dna-sequences-duplicate-id.fasta'), mode='r')

    with self.assertRaisesRegex(ValidationError, '3.*duplicate.*1'):
        fmt.validate()
def _2(db: Bowtie2IndexDirFmt) -> DNAFASTAFormat:
    """Write the output of ``bowtie2-inspect`` for ``db`` to a FASTA file.

    Parameters
    ----------
    db : Bowtie2IndexDirFmt
        Index directory; the index prefix handed to ``bowtie2-inspect`` is
        ``db.path / db.get_basename()``.

    Returns
    -------
    DNAFASTAFormat
        A new format object whose file holds whatever the command printed
        to stdout.
    """
    result = DNAFASTAFormat()
    bowtie2_inspect_cmd = ['bowtie2-inspect',
                           str(db.path / db.get_basename())]
    # Use a context manager so the output handle is flushed and closed
    # before the result is returned, instead of leaking it.
    with open(str(result), 'w') as out_fh:
        run_command(bowtie2_inspect_cmd, stdout=out_fh)
    return result
def test_dna_fasta_format_validate_positive(self):
    """The ``dna-sequences.fasta`` fixture validates without raising."""
    fmt = DNAFASTAFormat(self.get_data_path('dna-sequences.fasta'),
                         mode='r')
    fmt.validate()
def test_dna_fasta_format_bom_passes(self):
    """The ``dna-with-bom-passes.fasta`` fixture validates cleanly."""
    fmt = DNAFASTAFormat(self.get_data_path('dna-with-bom-passes.fasta'),
                         mode='r')
    fmt.validate()
def cluster_features_closed_reference(sequences: DNAFASTAFormat,
                                      table: biom.Table,
                                      reference_sequences: DNAFASTAFormat,
                                      perc_identity: float,
                                      strand: str = 'plus',
                                      threads: int = 1
                                      ) -> (biom.Table, DNAFASTAFormat,
                                            DNAFASTAFormat):
    """Cluster features against a reference with ``vsearch``.

    The feature sequences are searched against ``reference_sequences``
    using ``vsearch --usearch_global``. Features reported as not matched
    are removed from the table and the remaining observations are collapsed
    with the mapping derived from the ``--uc`` output.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Feature sequences; their IDs are checked against the table's
        observation IDs by ``_error_on_nonoverlapping_ids``.
    table : biom.Table
        Feature table. NOTE: unmatched observations are filtered out of this
        object in place before it is collapsed.
    reference_sequences : DNAFASTAFormat
        Passed to vsearch as ``--db``.
    perc_identity : float
        Passed to vsearch as ``--id``.
    strand : str, optional
        Passed to vsearch as ``--strand``.
    threads : int, optional
        Passed to vsearch as ``--threads``.

    Returns
    -------
    (biom.Table, DNAFASTAFormat, DNAFASTAFormat)
        The collapsed table, the matched sequences and the unmatched
        sequences.

    Raises
    ------
    VSearchError
        If processing the ``--uc`` output raises ``ValueError``, which is
        reported as no matches to ``reference_sequences``.
    """
    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {e.metadata['id'] for e in skbio.io.read(
                    str(sequences), constructor=skbio.DNA, format='fasta')}
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)
    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = ['vsearch',
               '--usearch_global', fasta_with_sizes.name,
               '--id', str(perc_identity),
               '--db', str(reference_sequences),
               '--uc', out_uc.name,
               '--strand', str(strand),
               '--qmask', 'none',  # ensures no lowercase DNA chars
               '--notmatched', tmp_unmatched_seqs.name,
               '--threads', str(threads)]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences --- if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the matched sequences, this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be more ideal if --usearch_global,
            # above let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = ['vsearch',
                   '--sortbysize', tmp_unmatched_seqs.name,
                   '--xsize',
                   '--output', str(unmatched_seqs)]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError as err:
            # Chain the original error so the root cause stays visible in
            # the traceback.
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.') from err

    # Read the unmatched IDs through a managed handle so the file is closed
    # as soon as the IDs have been collected.
    with open(str(unmatched_seqs)) as unmatched_fh:
        unmatched_ids = [e.metadata['id']
                         for e in skbio.io.read(unmatched_fh,
                                                constructor=skbio.DNA,
                                                format='fasta')]
    table.filter(ids_to_keep=unmatched_ids, invert=True, axis='observation',
                 inplace=True)
    table = table.collapse(collapse_f, norm=False, min_group_size=1,
                           axis='observation',
                           include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
def _rna_to_dna(iterator):
    """Write the output of ``_rna_to_dna_iterator`` to a new FASTA file.

    Returns the ``DNAFASTAFormat`` object the records were written into.
    """
    dna_fasta = DNAFASTAFormat()
    dna_records = iter(_rna_to_dna_iterator(iterator))
    skbio.io.write(dna_records, format='fasta', into=str(dna_fasta))
    return dna_fasta
def setUp(self):
    """Expose the packaged ``derep-test.fasta`` data as ``self.dna_seqs``."""
    super().setUp()
    fixture_fp = pkg_resources.resource_filename(
        'rescript.tests', 'data/derep-test.fasta')
    fasta = DNAFASTAFormat(fixture_fp, mode='r')
    self.dna_seqs = fasta.view(DNAIterator)
def _rna_to_dna(path):
    """Reverse-transcribe each record read from ``path`` into a new file.

    Records come from ``_read_rna_fasta(path)``; each one is
    reverse-transcribed and written to a fresh ``DNAFASTAFormat``, which is
    returned.
    """
    dna_fasta = DNAFASTAFormat()
    with dna_fasta.open() as out_fh:
        for rna in _read_rna_fasta(path):
            dna = rna.reverse_transcribe()
            dna.write(out_fh)
    return dna_fasta
def setUp(self):
    """Load the taxonomy table and the single-end reads fixtures.

    Sets ``taxonomy_fp``/``taxonomy`` from ``taxonomy.tsv`` and
    ``reads_fp``/``reads`` from ``se-dna-sequences.fasta``.
    """
    super().setUp()
    self.taxonomy_fp = self.get_data_path('taxonomy.tsv')
    # ``pd.Series.from_csv`` was deprecated in pandas 0.21 and removed in
    # 1.0. This ``read_csv`` call mirrors its defaults (no header row, first
    # column as index, date parsing attempted on the index) and keeps the
    # first data column as the Series.
    taxonomy = pd.read_csv(self.taxonomy_fp, sep='\t', header=None,
                           index_col=0, parse_dates=True).iloc[:, 0]
    # ``from_csv`` returned an unnamed Series with an unnamed index when
    # there was no header.
    taxonomy.index.name = None
    taxonomy.name = None
    self.taxonomy = taxonomy
    self.reads_fp = self.get_data_path('se-dna-sequences.fasta')
    self.reads = DNAFASTAFormat(self.reads_fp, mode='r')
def align_regional_kmers(kmers: DNAFASTAFormat,
                         rep_seq: pd.Series,
                         region: str,
                         max_mismatch: int = 2,
                         chunk_size: int = 100,
                         debug: bool = False,
                         n_workers: int = 1,
                         client_address: str = None) -> KmerAlignFormat:
    """
    Performs regional alignment between database "kmers" and ASVs

    Parameters
    ----------
    kmers : DNAFASTAFormat
        The set of reference sequences extracted from the database. These
        are assumed to start at the same position of the 16s rRNA sequence
        as the sequences being tested and must be the same length as the
        ASVs being aligned.
    rep_seq : Series
        The representative sequences for the regional ASV table being
        aligned. These are assumed to start at the same position as the
        kmers and should be trimmed to the same length.
    region : str
        An identifier for the region. Ideally, this matches the identifier
        used in the reference region map.
    max_mismatch : int
        The maximum number of mismatched nucleotides allowed in mapping
        between a sequence and kmer.
    chunk_size : int, optional
        The partition size used when converting the sequences to dask
        objects. Kmers are read from file in batches of
        ``chunk_size * 100`` sequences.
    debug : bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all options.
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        Forwarded to ``_setup_dask_client`` as ``address``.

    Returns
    -------
    KmerAlignFormat
        A tab-separated file holding the alignment records produced by
        ``_align_kmers`` for every kmer/ASV chunk pair, with a ``region``
        and a ``max-mismatch`` column appended.

    Raises
    ------
    ValueError
        If the kmers in the first batch and the ASVs differ in length.
    """
    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Converts the representative sequences to a delayed object; only the
    # sequence length is needed from the check.
    _, asv_length = _check_read_lengths(rep_seq, 'rep_seq')
    rep_seq = dd.from_pandas(rep_seq.astype(str), chunksize=chunk_size)

    ff = KmerAlignFormat()

    # Performs the alignment
    for i, batch in enumerate(_chunks(kmers.view(DNAIterator),
                                      chunk_size * 100)):
        first_batch = (i == 0)
        batch = pd.Series({s.metadata['id']: str(s) for s in batch})
        if first_batch:
            # The length check is only run on the first batch.
            _, kmer_length = _check_read_lengths(batch, 'kmer')
            if kmer_length != asv_length:
                raise ValueError('The kmer and ASV sequences must be the'
                                 ' same length')
        batch = dd.from_pandas(batch, chunksize=chunk_size)

        aligned_batch = np.hstack([
            dask.delayed(_align_kmers)(kmer, asv, max_mismatch)
            for kmer, asv
            in it.product(batch.to_delayed(), rep_seq.to_delayed())
        ])
        aligned_batch = pd.concat(axis=0, objs=dask.compute(*aligned_batch))
        aligned_batch['region'] = region
        aligned_batch['max-mismatch'] = max_mismatch

        # The first batch creates the file with a header; later batches are
        # appended without one.
        if first_batch:
            aligned_batch.to_csv(str(ff), sep='\t', index=False, mode='w')
        else:
            aligned_batch.to_csv(str(ff), sep='\t', index=False,
                                 header=False, mode='a')

    return ff
def prepare_extracted_region(sequences: DNAFASTAFormat,
                             region: str,
                             trim_length: int,
                             fwd_primer: str,
                             rev_primer: str,
                             reverse_complement_rev: bool = True,
                             reverse_complement_result: bool = False,
                             chunk_size: int = 10000,
                             debug: bool = False,
                             n_workers: int = 1,
                             client_address: str = None,
                             ) -> (DNAFASTAFormat, pd.DataFrame):
    """
    Prepares an extracted database for regional alignment

    This function takes an amplified region of the database, expands the
    degenerate sequences and collapses the duplicated sequences under a
    single id that can be untangled later.

    Parameters
    ----------
    sequences : q2_types.DNAFASTAFormat
        The regional sequences to be collapsed
    region : str
        A unique name for the region being handled
    trim_length : int
        The length of final sequences to match the trimmed kmers for
        kmer-based alignment.
    fwd_primer, rev_primer : str
        The primers recorded with the kmer ids (forwarded to
        ``_expand_ids``).
    reverse_complement_rev : bool, optional
        When true, `rev_primer` is reverse complemented before it is used.
    reverse_complement_result : bool, optional
        Forwarded to ``_collapse_all_sequences``.
    chunk_size : int, optional
        The number of sequences to group for analysis
    debug : bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all options
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        Forwarded to ``_setup_dask_client`` as ``address``.

    Returns
    -------
    q2_types.DNAFASTAFormat
        The reads with degenerate nucleotides expanded and duplicated
        sequences collapsed.
    DataFrame
        A mapping between the kmer sequence name and the full database
        sequence name, along with regional information. It is indexed and
        sorted by ``db-seq``.
    """
    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Reverse complements the reverse primer
    if reverse_complement_rev:
        rev_primer = str(DNA(rev_primer).reverse_complement())

    # Reads in the sequences, one delayed block per chunk
    seq_iter = sequences.view(DNAIterator)
    blocks = [dask.delayed(_block_seqs)(chunk)
              for chunk in _chunks(seq_iter, int(chunk_size))]

    # Makes the fake extraction position based on the trim length
    trimmed = [dask.delayed(_artifical_trim)(block, trim_length)
               for block in blocks]

    # Prepares the amplicon for collapsing
    delayed_condensed = [dask.delayed(_condense_seqs)(piece)
                         for piece in trimmed]
    condensed = dd.from_delayed(
        delayed_condensed,
        meta=[('amplicon', 'str'), ('seq-name', 'str')],
    )

    # Collapses the sequences and builds the id mapping
    ff, group2 = _collapse_all_sequences(condensed,
                                         reverse_complement_result)
    ids = _expand_ids(group2, fwd_primer, rev_primer, region, trim_length,
                      chunk_size)
    id_map = ids.compute().set_index('db-seq').sort_index()

    return (ff, id_map)
def test_dna_fasta_format_missing_initial_ID(self):
    """Validation fails with a 'First line' message for the fixture."""
    fmt = DNAFASTAFormat(
        self.get_data_path('dna-sequences-first-line-not-id.fasta'),
        mode='r')

    with self.assertRaisesRegex(ValidationError, 'First line'):
        fmt.validate()
def test_dna_fasta_format_validate_negative(self):
    """Validating ``not-dna-sequences`` raises a 'DNAFASTA' error."""
    fmt = DNAFASTAFormat(self.get_data_path('not-dna-sequences'), mode='r')

    with self.assertRaisesRegex(ValidationError, 'DNAFASTA'):
        fmt.validate()
def test_dna_fasta_format_corrupt_characters(self):
    """Validation fails with a 'utf-8' message for the corrupt fixture."""
    fmt = DNAFASTAFormat(
        self.get_data_path('dna-sequences-corrupt-characters.fasta'),
        mode='r')

    with self.assertRaisesRegex(ValidationError, 'utf-8.*2'):
        fmt.validate()
def test_dna_fasta_format_validate_negative(self):
    """Validating ``not-dna-sequences`` raises a 'DNAFASTA' ValueError."""
    fmt = DNAFASTAFormat(self.get_data_path('not-dna-sequences'), mode='r')

    with self.assertRaisesRegex(ValueError, 'DNAFASTA'):
        fmt.validate()
def test_dna_fasta_format_bom_fails(self):
    """Validation fails with a 'First line' message for the BOM fixture."""
    fmt = DNAFASTAFormat(self.get_data_path('dna-with-bom-fails.fasta'),
                         mode='r')

    with self.assertRaisesRegex(ValidationError, 'First line'):
        fmt.validate()