def _convert_generator_to_delayed_seq_block(generator, chunksize=5000):
    """
    Converts a generator of sequences into a list of delayed sequence blocks
    """
    seq_block = [
        dask.delayed(_to_seq_array)(seqs)
        for seqs in _chunks(generator, chunksize)
    ]
    return seq_block
def extract_reads(sequences: DNASequencesDirectoryFormat, f_primer: str,
                  r_primer: str, trunc_len: int = 0, trim_left: int = 0,
                  identity: float = 0.8, min_length: int = 50,
                  max_length: int = 0, n_jobs: int = 1,
                  batch_size: int = 'auto') -> DNAFASTAFormat:
    """Extract the reads selected by a primer or primer pair.

    Only sequences which match the primers at greater than the specified
    identity are returned.

    Parameters
    ----------
    sequences : DNASequencesDirectoryFormat
        An aligned list of skbio.sequence.DNA query sequences
    f_primer : skbio.sequence.DNA
        Forward primer sequence
    r_primer : skbio.sequence.DNA
        Reverse primer sequence
    trunc_len : int, optional
        The read is cut to `trunc_len` if `trunc_len` is positive.
        Applied before `trim_left`.
    trim_left : int, optional
        `trim_left` nucleotides are removed from the 5' end if `trim_left`
        is positive. Applied after `trunc_len`.
    identity : float, optional
        Minimum combined primer match identity threshold. Default: 0.8
    min_length : int, optional
        Minimum amplicon length. Shorter amplicons are discarded.
        Default: 50
    max_length : int, optional
        Maximum amplicon length. Longer amplicons are discarded.
    n_jobs : int, optional
        Number of separate processes to break the task into.
    batch_size : int, optional
        Number of sequences to be processed in one batch.

    Returns
    -------
    q2_types.DNAFASTAFormat
        Contains the extracted reads.
    """
    if min_length > trunc_len - trim_left and trunc_len > 0:
        raise ValueError('The minimum length setting is greater than the '
                         'length of the truncated sequences. This will cause '
                         'all sequences to be removed from the dataset. To '
                         'proceed, set a min_length ≤ trunc_len - trim_left.')
    n_jobs = effective_n_jobs(n_jobs)
    if batch_size == 'auto':
        batch_size = _autotune_reads_per_batch(
            sequences.file.view(DNAFASTAFormat), n_jobs)
    sequences = sequences.file.view(DNAIterator)
    ff = DNAFASTAFormat()
    with open(str(ff), 'a') as fh:
        with Parallel(n_jobs) as parallel:
            for chunk in _chunks(sequences, batch_size):
                amplicons = parallel(
                    delayed(_gen_reads)(sequence, f_primer, r_primer,
                                        trunc_len, trim_left, identity,
                                        min_length, max_length)
                    for sequence in chunk)
                for amplicon in amplicons:
                    if amplicon is not None:
                        skbio.write(amplicon, format='fasta', into=fh)
    if os.stat(str(ff)).st_size == 0:
        raise RuntimeError("No matches found")
    return ff
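# --- Usage sketch (not part of the original module) ---
# A minimal example of how extract_reads might be called directly, assuming a
# hypothetical reference stored in 'ref-seqs/' and the common 515F/806R V4
# primer pair; the path, primer strings, and length bounds are illustrative
# values, not fixtures from this package.
def _example_extract_v4_reads():
    ref = DNASequencesDirectoryFormat('ref-seqs/', mode='r')  # hypothetical path
    reads = extract_reads(ref,
                          f_primer='GTGYCAGCMGCCGCGGTAA',   # 515F (example)
                          r_primer='GGACTACNVGGGTWTCTAAT',  # 806R (example)
                          min_length=100, max_length=400,
                          n_jobs=4)
    # `reads` is a DNAFASTAFormat; str(reads) gives the path of the FASTA
    # file containing the extracted amplicons.
    return reads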
def prepare_extracted_region(sequences: DNAFASTAFormat,
                             region: str,
                             trim_length: int,
                             fwd_primer: str,
                             rev_primer: str,
                             reverse_complement_rev: bool = True,
                             reverse_complement_result: bool = False,
                             chunk_size: int = 10000,
                             debug: bool = False,
                             n_workers: int = 1,
                             client_address: str = None,
                             ) -> (DNAFASTAFormat, pd.DataFrame):
    """
    Prepares an extracted database for regional alignment

    This function takes an amplified region of the database, expands the
    degenerate sequences and collapses the duplicated sequences under a
    single id that can be untangled later.

    Parameters
    ----------
    sequences : q2_types.DNAFASTAFormat
        The regional sequences to be collapsed
    region : str
        A unique name for the region being handled
    trim_length : int
        The length of the final sequences, to match the trimmed kmers used
        for kmer-based alignment.
    fwd_primer : str
        The forward primer used to amplify the region
    rev_primer : str
        The reverse primer used to amplify the region
    reverse_complement_rev : bool, optional
        Whether the reverse primer should be reverse complemented before use
    reverse_complement_result : bool, optional
        Whether the collapsed sequences should be reverse complemented in
        the result
    chunk_size : int, optional
        The number of sequences to group for analysis
    debug : bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all options
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        The address of an existing dask client to connect to. If None, a
        client is set up based on `debug` and `n_workers`.

    Returns
    -------
    q2_types.DNAFASTAFormat
        The reads with degenerate nucleotides expanded and duplicated
        sequences collapsed.
    DataFrame
        A mapping between the kmer sequence name and the full database
        sequence name, along with regional information
    """
    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Reverse complements the reverse primer
    if reverse_complement_rev:
        rev_primer = str(DNA(rev_primer).reverse_complement())

    # Reads in the sequences
    sequences = sequences.view(DNAIterator)
    seq_blocks = [dask.delayed(_block_seqs)(seq)
                  for seq in _chunks(sequences, int(chunk_size))]

    # Makes the fake extraction position based on the trim length
    fragment = [dask.delayed(_artifical_trim)(seq, trim_length)
                for seq in seq_blocks]

    # Prepares the amplicon for collapsing
    condensed = dd.from_delayed(
        [dask.delayed(_condense_seqs)(seq) for seq in fragment],
        meta=[('amplicon', 'str'), ('seq-name', 'str')]
    )

    # Collapses the duplicated sequences and writes them out, then expands
    # the kmer ids into the database-to-region mapping
    ff, group2 = _collapse_all_sequences(condensed, reverse_complement_result)
    ids = _expand_ids(group2, fwd_primer, rev_primer, region, trim_length,
                      chunk_size)

    return (ff, ids.compute().set_index('db-seq').sort_index())
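# --- Usage sketch (not part of the original module) ---
# A minimal example of preparing a regional database with
# prepare_extracted_region, assuming the V4 amplicons were already written to
# a hypothetical 'v4-extracted.fasta'. The region label, trim length, and
# primer strings are placeholder values.
def _example_prepare_v4_region():
    regional_seqs = DNAFASTAFormat('v4-extracted.fasta', mode='r')  # hypothetical path
    collapsed, id_map = prepare_extracted_region(
        regional_seqs,
        region='V4',
        trim_length=250,
        fwd_primer='GTGYCAGCMGCCGCGGTAA',
        rev_primer='GGACTACNVGGGTWTCTAAT',
        n_workers=2,
    )
    # `collapsed` holds the expanded, collapsed kmer sequences; `id_map` is a
    # DataFrame indexed by 'db-seq' linking each kmer back to its full-length
    # database sequence and region.
    return collapsed, id_map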
def align_regional_kmers(kmers: DNAFASTAFormat, rep_seq: pd.Series,
                         region: str, max_mismatch: int = 2,
                         chunk_size: int = 100,
                         debug: bool = False,
                         n_workers: int = 1,
                         client_address: str = None) -> KmerAlignFormat:
    """
    Performs regional alignment between database "kmers" and ASVs

    Parameters
    ----------
    kmers : DNAFASTAFormat
        The set of reference sequences extracted from the database. These
        are assumed to start at the same position of the 16S rRNA sequence
        as the sequences being tested and to be the same length as the ASVs
        being aligned.
    rep_seq : pd.Series
        The representative sequences for the regional ASV table being
        aligned. These are assumed to start at the same position as the
        kmers and should be trimmed to the same length.
    region : str
        An identifier for the region. Ideally, this matches the identifier
        used in the reference region map.
    max_mismatch : int
        The maximum number of mismatched nucleotides allowed in mapping
        between a sequence and a kmer.
    chunk_size : int, optional
        The number of sequences to group for analysis
    debug : bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all options
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        The address of an existing dask client to connect to. If None, a
        client is set up based on `debug` and `n_workers`.

    Returns
    -------
    KmerAlignFormat
        A tab-delimited mapping between each kmer (`kmer`) and ASV (`asv`),
        including the region (`region`), the number of mismatched basepairs
        (`mismatch`) and the sequence length (`length`).
    """
    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Converts the representative sequences to a delayed object
    num_asvs, asv_length = _check_read_lengths(rep_seq, 'rep_seq')
    rep_seq_ids = rep_seq.index.values
    rep_seq = dd.from_pandas(rep_seq.astype(str), chunksize=chunk_size)

    ff = KmerAlignFormat()

    # Performs the alignment batch by batch
    for i, batch in enumerate(_chunks(kmers.view(DNAIterator),
                                      chunk_size * 100)):
        if i == 0:
            batch = pd.Series({s.metadata['id']: str(s) for s in batch})
            num_kmers, kmer_length = _check_read_lengths(batch, 'kmer')
            if kmer_length != asv_length:
                raise ValueError('The kmer and ASV sequences must be the'
                                 ' same length')
            batch = dd.from_pandas(batch, chunksize=chunk_size)
        else:
            batch = dd.from_pandas(
                pd.Series({s.metadata['id']: str(s) for s in batch}),
                chunksize=chunk_size
            )

        aligned_batch = np.hstack([
            dask.delayed(_align_kmers)(kmer, asv, max_mismatch)
            for kmer, asv in it.product(batch.to_delayed(),
                                        rep_seq.to_delayed())
        ])
        aligned_batch = pd.concat(axis=0, objs=dask.compute(*aligned_batch))
        aligned_batch['region'] = region
        aligned_batch['max-mismatch'] = max_mismatch

        if i == 0:
            aligned_batch.to_csv(str(ff), sep='\t', index=False, mode='w')
        else:
            aligned_batch.to_csv(str(ff), sep='\t', index=False,
                                 header=False, mode='a')

    return ff
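# --- Usage sketch (not part of the original module) ---
# A minimal example of aligning regional ASVs against the kmers produced by
# prepare_extracted_region. The kmer path and the toy rep_seq values are
# placeholders; in practice the ASV sequences must be trimmed to the same
# length as the kmers or the length check will raise a ValueError.
def _example_align_v4():
    kmers = DNAFASTAFormat('v4-kmers.fasta', mode='r')  # hypothetical path
    rep_seq = pd.Series({
        'asv01': 'GTGCCAGCAGCCGCGGTAA',  # placeholder sequences; use real,
        'asv02': 'GTGCCAGCCGCCGCGGTAA',  # length-matched ASVs in practice
    })
    alignment = align_regional_kmers(kmers, rep_seq, region='V4',
                                     max_mismatch=2, n_workers=2)
    # `alignment` is a KmerAlignFormat: a tab-separated table mapping kmers
    # to ASVs with the region, mismatch, and length columns described above.
    return alignment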