Example #1
File: _utils.py Project: ditag/q2-sidle
def _convert_generator_to_delayed_seq_block(generator, chunksize=5000):
    """
    Converts from a generator to a block of sequences
    """
    seq_block = [
        dask.delayed(_to_seq_array)(seqs)
        for seqs in _chunks(generator, chunksize)
    ]

    return seq_block
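For context, here is a runnable sketch of how a delayed block like this might be built and consumed. `_chunks` and `_to_seq_array` are project helpers not shown in this excerpt, so toy stand-ins are assumed below.

# Hypothetical usage sketch: toy stand-ins for _chunks and _to_seq_array.
import itertools

import dask
import numpy as np

def _chunks(iterable, size):
    # Yield successive lists of at most `size` items from an iterable.
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, size))
        if not chunk:
            return
        yield chunk

def _to_seq_array(seqs):
    # Toy stand-in: pack a chunk of string sequences into a numpy array.
    return np.array(seqs)

generator = ('ACGT' for _ in range(12000))
seq_block = [dask.delayed(_to_seq_array)(seqs)
             for seqs in _chunks(generator, 5000)]
blocks = dask.compute(*seq_block)  # three arrays: 5000, 5000, and 2000 sequences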
Example #2
def extract_reads(sequences: DNASequencesDirectoryFormat, f_primer: str,
                  r_primer: str, trunc_len: int = 0, trim_left: int = 0,
                  identity: float = 0.8, min_length: int = 50,
                  max_length: int = 0, n_jobs: int = 1,
                  batch_size: int = 'auto') -> DNAFASTAFormat:
    """Extract the read selected by a primer or primer pair. Only sequences
    which match the primers at greater than the specified identity are returned

    Parameters
    ----------
    sequences : DNASequencesDirectoryFormat
        An aligned list of skbio.sequence.DNA query sequences
    f_primer : skbio.sequence.DNA
        Forward primer sequence
    r_primer : skbio.sequence.DNA
        Reverse primer sequence
    trunc_len : int, optional
        Read is cut to trunc_len if trunc_len is positive. Applied before
        trim_left.
    trim_left : int, optional
        `trim_left` nucleotides are removed from the 5' end if trim_left is
        positive. Applied after trunc_len.
    identity : float, optional
        Minimum combined primer match identity threshold. Default: 0.8
    min_length: int, optional
        Minimum amplicon length. Shorter amplicons are discarded. Default: 50
    max_length: int, optional
        Maximum amplicon length. Longer amplicons are discarded.
    n_jobs: int, optional
        Number of separate processes to break the task into.
    batch_size: int or 'auto', optional
        Number of sequences to process in each batch. If 'auto', the batch
        size is chosen automatically based on the input.

    Returns
    -------
    q2_types.DNAFASTAFormat
        A file containing the extracted reads.
    """
    if min_length > trunc_len - trim_left and trunc_len > 0:
        raise ValueError('The minimum length setting is greater than the '
                         'length of the truncated sequences. This will cause '
                         'all sequences to be removed from the dataset. To '
                         'proceed, set a min_length ≤ trunc_len - trim_left.')
    n_jobs = effective_n_jobs(n_jobs)
    if batch_size == 'auto':
        batch_size = _autotune_reads_per_batch(
            sequences.file.view(DNAFASTAFormat), n_jobs)
    sequences = sequences.file.view(DNAIterator)
    ff = DNAFASTAFormat()
    with open(str(ff), 'a') as fh:
        with Parallel(n_jobs) as parallel:
            for chunk in _chunks(sequences, batch_size):
                amplicons = parallel(delayed(_gen_reads)(sequence, f_primer,
                                                         r_primer, trunc_len,
                                                         trim_left, identity,
                                                         min_length,
                                                         max_length)
                                     for sequence in chunk)
                for amplicon in amplicons:
                    if amplicon is not None:
                        skbio.write(amplicon, format='fasta', into=fh)
    if os.stat(str(ff)).st_size == 0:
        raise RuntimeError("No matches found")
    return ff
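The batching pattern above (chunk the iterator, fan each chunk out with joblib, then write matches as they arrive) can be shown without any QIIME 2 types. The sketch below uses a toy per-sequence function in place of _gen_reads and is only an illustration of the pattern.

# Simplified sketch of the chunked Parallel/delayed pattern used above;
# reverse_complement is a toy stand-in for the real per-sequence worker.
import itertools

from joblib import Parallel, delayed

def _chunks(iterable, size):
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, size))
        if not chunk:
            return
        yield chunk

def reverse_complement(seq):
    return seq.translate(str.maketrans('ACGT', 'TGCA'))[::-1]

sequences = ('ACGTACGTAA' for _ in range(1000))
with open('amplicons.txt', 'w') as fh:
    with Parallel(n_jobs=2) as parallel:
        for chunk in _chunks(sequences, 250):
            amplicons = parallel(delayed(reverse_complement)(seq)
                                 for seq in chunk)
            for amplicon in amplicons:
                if amplicon is not None:
                    fh.write(amplicon + '\n')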
Example #3
def prepare_extracted_region(sequences: DNAFASTAFormat,
    region: str,
    trim_length: int,
    fwd_primer: str,
    rev_primer: str,
    reverse_complement_rev: bool = True,
    reverse_complement_result: bool = False,
    chunk_size: int = 10000,
    debug: bool = False,
    n_workers: int = 1,
    client_address: str = None,
    ) -> (DNAFASTAFormat, pd.DataFrame):
    """
    Prepares and extracted database for regional alignment

    This function takes an amplified region of the database, expands the
    degenerate sequences and collapses the duplciated sequences under a 
    single id that can be untangled later.

    Parameters
    ----------
    sequences: q2_type.DNAFASTAFormat
        The regional sequences to be collapsed
    region: str
        A unique name for the region being handled
    trim_length : int
        The length of the final sequences, matched to the trimmed kmers for
        kmer-based alignment.
    chunk_size: int, optional
        The number of sequences to group for analysis
    debug: bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all other options
    n_workers: int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.

    Returns
    -------
    q2_types.DNAFASTAFormat
        The reads with degenerate nucleotides expanded and duplicated 
        sequences collapsed.
    DataFrame
        A mapping between the kmer sequence name and the full database
        sequence name, along with regional information
    """

    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,  
                       n_workers=n_workers, address=client_address)

    # Reverse complements the reverse primer
    if reverse_complement_rev:
        rev_primer = str(DNA(rev_primer).reverse_complement())

    # Reads in the sequences
    sequences = sequences.view(DNAIterator)
    seq_blocks = [dask.delayed(_block_seqs)(seq)
                  for seq in _chunks(sequences, int(chunk_size))]
    # Makes the fake extraction position based on the trim length
    fragment = [dask.delayed(_artifical_trim)(seq, trim_length) 
                for seq in seq_blocks]
    # Prepares the amplicon for collapsing
    condensed = dd.from_delayed([
        dask.delayed(_condense_seqs)(seq) for seq in fragment],
        meta=[('amplicon', 'str'), ('seq-name', 'str')]
    )
    # Collapses duplicated sequences and builds the kmer-to-database id map
    ff, group2 = _collapse_all_sequences(condensed, reverse_complement_result)
    ids = _expand_ids(group2, fwd_primer, rev_primer, region, trim_length,
                      chunk_size)

    return (ff, ids.compute().set_index('db-seq').sort_index())
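The core dask pattern here, wrapping per-chunk work in dask.delayed and then stitching the partitions into a dask DataFrame with dd.from_delayed, can be sketched with toy data and toy stand-ins for the project's helpers:

# Minimal sketch of the delayed-chunks -> dask DataFrame pattern used above.
import dask
import dask.dataframe as dd
import pandas as pd

def trim(records, length):
    # Toy stand-in for per-chunk work such as the artificial trim step.
    return [(name, seq[:length]) for name, seq in records]

def to_frame(records):
    # Toy stand-in for _condense_seqs: one DataFrame partition per chunk.
    return pd.DataFrame(records, columns=['seq-name', 'amplicon'])

chunks = [[('seq1', 'ACGTACGT'), ('seq2', 'TTGGCCAA')],
          [('seq3', 'GGGTTTAA')]]
trimmed = [dask.delayed(trim)(chunk, 4) for chunk in chunks]
condensed = dd.from_delayed(
    [dask.delayed(to_frame)(chunk) for chunk in trimmed],
    meta=[('seq-name', 'str'), ('amplicon', 'str')]
)
print(condensed.compute())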
Example #4
def align_regional_kmers(kmers: DNAFASTAFormat,
    rep_seq: pd.Series,
    region: str,
    max_mismatch: int = 2,
    chunk_size: int = 100,
    debug: bool = False,
    n_workers: int = 1,
    client_address: str = None) -> KmerAlignFormat:
    """
    Performs regional alignment between database "kmers" and ASVs

    Parameters
    ----------
    kmers : DNAFASTAFormat
        The set of reference sequences extracted from the database. These are
        assumed to start at the same position of the 16S rRNA sequence as
        the sequences being tested and to be the same length as the ASVs
        being aligned.
    rep_seq: pd.Series
        The representative sequences for the regional ASV table being aligned.
        These are assumed to start at the same position as the kmers and
        should be trimmed to the same length.
    region: str
        An identifier for the region. Ideally, this matches the identifier 
        used in the reference region map
    max_mismatch: int
        The maximum number of mismatched nucleotides allowed in mapping
        between a sequence and kmer.
    debug: bool
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all other options
    n_workers: int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.

    Returns
    -------
    KmerAlignFormat
        A mapping between the kmer (`kmer`) and the ASV (`asv`), including
        the region (`region`), the number of mismatched basepairs
        (`mismatch`), and the sequence length (`length`).

    """
    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,  
                       n_workers=n_workers, address=client_address)

    # Converts the representative sequences to a delayed object
    num_asvs, asv_length = _check_read_lengths(rep_seq, 'rep_seq')
    rep_seq_ids = rep_seq.index.values
    rep_seq = dd.from_pandas(rep_seq.astype(str),
                             chunksize=chunk_size)

    ff = KmerAlignFormat()

    # Performs the alignment
    for i, batch in enumerate(_chunks(kmers.view(DNAIterator),
                                      chunk_size * 100)):
        if i == 0:
            batch = pd.Series({s.metadata['id']: str(s) for s in batch})
            num_kmers, kmer_length = _check_read_lengths(batch, 'kmer')

            if kmer_length != asv_length:
                raise ValueError('The kmer and ASV sequences must be the'
                                 ' same length')
            batch = dd.from_pandas(batch, chunksize=chunk_size)
        else:
            batch = dd.from_pandas(
                pd.Series({s.metadata['id']: str(s) for s in batch}),
                chunksize=chunk_size
                )

        aligned_batch = np.hstack([
            dask.delayed(_align_kmers)(kmer, asv, max_mismatch)
            for kmer, asv in it.product(batch.to_delayed(), rep_seq.to_delayed())
            ])

        aligned_batch = pd.concat(axis=0, objs=dask.compute(*aligned_batch))

        aligned_batch['region'] = region
        aligned_batch['max-mismatch'] = max_mismatch
        if i == 0:
            aligned_batch.to_csv(str(ff), sep='\t', index=False, 
                                 mode='w')
        else:
            aligned_batch.to_csv(str(ff), sep='\t', index=False, 
                                 header=False,
                                 mode='a')

    return ff
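Finally, the write-the-header-once / append-thereafter pattern at the end of this function is a general way to stream batched results into a single table. A minimal pandas-only sketch (toy data, no dask or QIIME 2 types):

# Minimal sketch: stream batched results into one TSV, writing the header
# only for the first batch and appending without it afterwards.
import pandas as pd

batches = [
    pd.DataFrame({'kmer': ['k1', 'k2'], 'asv': ['a1', 'a2'], 'mismatch': [0, 1]}),
    pd.DataFrame({'kmer': ['k3'], 'asv': ['a3'], 'mismatch': [2]}),
]

out_path = 'alignment.tsv'
for i, batch in enumerate(batches):
    batch['region'] = 'v34'        # constant annotation columns added per batch
    batch['max-mismatch'] = 2
    if i == 0:
        batch.to_csv(out_path, sep='\t', index=False, mode='w')
    else:
        batch.to_csv(out_path, sep='\t', index=False, header=False, mode='a')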