Exemplo n.º 1
0
def get_kraken_version():
    """
    Detect which Kraken executable is available, preferring kraken2.

    Each candidate executable is looked up with ``which``. Both lookups are
    always performed; a hit on ``kraken2`` overrides a hit on ``kraken``.

    Returns
    -------
    version_kraken : int or None
        2 if ``kraken2`` was found, 1 if only ``kraken`` was found, or None
        if neither lookup succeeded
    """
    detected_version = None

    # Ordered from least to most preferred: a later hit overwrites an earlier one.
    candidates = (('kraken', 1), ('kraken2', 2))
    for executable, version in candidates:
        found, _, _ = utils_run_command(command=['which', executable],
                                        shell_True=False,
                                        timeout_sec_None=None,
                                        print_comand_True=False)
        if found:
            detected_version = version

    return detected_version
Exemplo n.º 2
0
def get_statistics_samtools(alignment, outdir):
    """
    Run Samtools stats to get several statistics from the alignment file

    The command is executed without a shell: the captured standard output is
    written to ``samtools_stats.txt`` inside `outdir` by this function, so
    the alignment and output paths are never interpreted by a shell.

    Parameters
    ----------
    alignment : str
        Path to the alignment file (can be SAM, BAM or CRAM)
    outdir : str
        Path to the output directory

    Returns
    -------
    run_successfully : bool
        Boolean stating if Samtools stats ran successfully or not
    samtools_stats : str or None
        If everything went fine, it returns the path to the samtools stats file, otherwise it returns None
    """

    samtools_stats = os.path.join(outdir, 'samtools_stats.txt')

    # No '>' redirection: the report is taken from the captured stdout instead.
    command = ['samtools', 'stats', alignment]

    run_successfully, stats_text, _ = utils_run_command(
        command=command,
        shell_True=False,
        timeout_sec_None=None,
        print_comand_True=True)
    print('')

    if not run_successfully:
        return run_successfully, None

    # The stats file is only created when the command succeeded.
    with open(samtools_stats, 'wt') as writer:
        writer.write(stats_text)

    return run_successfully, samtools_stats
Exemplo n.º 3
0
def run_kraken_report(kraken_db, kraken_output, outdir):
    """
    Get the Kraken report from kraken run

    Runs ``kraken-report`` and, on success, saves its standard output to
    ``kraken_report.<db>.txt`` inside `outdir`, where ``<db>`` is the last
    path component of `kraken_db` (a trailing path separator is ignored).

    Parameters
    ----------
    kraken_db : str
        Kraken DB name or path to the directory containing the Kraken DB
    kraken_output : str
        Path to Kraken output file
    outdir : str
        Path to the output directory

    Returns
    -------
    run_successfully : bool
        Boolean stating if kraken-report ran successfully or not
    kraken_results : str
        String with Kraken report (the captured standard output of the command)
    """

    command = ['kraken-report', '--db', kraken_db, kraken_output]
    run_successfully, kraken_results, _ = utils_run_command(
        command=command,
        shell_True=False,
        timeout_sec_None=None,
        print_comand_True=True)

    if run_successfully:
        # normpath drops a trailing separator, so 'path/to/db/' yields 'db'
        # rather than the empty basename (and a 'kraken_report..txt' file).
        db_label = os.path.basename(os.path.normpath(kraken_db))
        report_path = os.path.join(
            outdir, 'kraken_report.{db}.txt'.format(db=db_label))
        with open(report_path, 'wt') as writer:
            writer.write(kraken_results)

    return run_successfully, kraken_results
Exemplo n.º 4
0
def mapping_bowtie2(fastq, reference_index, outdir, threads=1):
    """
    Align paired-end reads to a Bowtie2 index and write a SAM file

    The alignment is written to ``alignment.sam`` inside `outdir`.

    Parameters
    ----------
    fastq : list
        List of fastq files; the first two entries are used as mate 1 and
        mate 2 of the paired-end reads
    reference_index : str
        Path to the reference Bowtie2 index
    outdir : str
        Path to the output directory
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if the bowtie2 command ran successfully or not
    sam : str or None
        Path to the sam file when the command succeeded, otherwise None
    """

    sam_path = os.path.join(outdir, 'alignment.sam')

    # Assemble the command option group by option group.
    bowtie2_cmd = ['bowtie2', '-q', '--very-fast']
    bowtie2_cmd += ['--threads', str(threads)]
    bowtie2_cmd += ['-x', reference_index]
    bowtie2_cmd += ['-1', fastq[0], '-2', fastq[1]]
    # Pairing constraints: forward/reverse mates, insert size from 0 to 2000.
    bowtie2_cmd += ['--fr', '-I', '0', '-X', '2000']
    # Restrict the output to concordantly aligned pairs.
    bowtie2_cmd += ['--no-discordant', '--no-mixed', '--no-unal']
    bowtie2_cmd += ['-S', sam_path]

    succeeded, _, _ = utils_run_command(command=bowtie2_cmd,
                                        shell_True=False,
                                        timeout_sec_None=None,
                                        print_comand_True=True)
    print('')

    return succeeded, (sam_path if succeeded else None)
Exemplo n.º 5
0
def index_sequence_bowtie2(reference, outdir, threads=1):
    """
    Build a Bowtie2 index for a reference sequence

    The index prefix is placed inside `outdir` and is named after the file
    name of `reference`.

    Parameters
    ----------
    reference : str
        Path to the reference fasta file against which the reads will be mapped
    outdir : str
        Path to the output directory
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if bowtie2-build ran successfully or not
    reference_index : str or None
        Path to the reference Bowtie2 index when the build succeeded, otherwise None
    """

    # Index prefix handed to bowtie2-build and returned to the caller on success.
    index_prefix = os.path.join(outdir, os.path.basename(reference))

    build_cmd = ['bowtie2-build', '--threads', str(threads)]
    build_cmd += [reference, index_prefix]

    succeeded, _, _ = utils_run_command(command=build_cmd,
                                        shell_True=False,
                                        timeout_sec_None=None,
                                        print_comand_True=True)
    print('')

    return succeeded, (index_prefix if succeeded else None)
Exemplo n.º 6
0
def run_kraken_main(files_to_classify,
                    kraken_db,
                    files_type,
                    outdir,
                    version_kraken,
                    db_mem=False,
                    quick=False,
                    min_base_quality=10,
                    threads=1):
    """
    Run Kraken for data classification

    The command line is assembled incrementally (only the options that apply
    are added, each input file being its own argument) and then executed
    without a shell.

    Parameters
    ----------
    files_to_classify : list
        List with files to be classified by Kraken. Can be one fasta or up to two fastq files
    kraken_db : str
        Kraken DB name or path to the directory containing the Kraken DB. Its last path component (a trailing path
        separator is ignored) is used to name the output files
    files_type : str
        Type of the files to be classified: fasta or fastq
    outdir : str
        Path to the output directory
    version_kraken : int or None
        1 if only first Kraken (or zero version) was found, 2 if kraken2 was found, or None if none of those was found.
        Any value other than 2 makes the ``kraken`` executable be used
    db_mem : bool, default False
        True if want to load the Kraken DB into memory before run, else False
    quick : bool, default False
        True if want to do a quick operation and only use the first hits
    min_base_quality : int, default 10
        Minimum base quality used in classification. Only used with fastq files and kraken2.
    threads : int, default 1
        Number of threads to be used

    Returns
    -------
    run_successfully : bool
        Boolean stating if Kraken ran successfully or not
    kraken_output : str or None
        Path to Kraken output file. If running kraken2, None is returned
    kraken_report : str or None
        Path to Kraken report (results) file in case of kraken2, else None

    Raises
    ------
    ValueError
        If `files_type` is neither 'fastq' nor 'fasta'
    SystemExit
        If no files are given, more than two files are given, or two fasta files are given
    """
    files_type_options = ['fastq', 'fasta']
    if files_type not in files_type_options:
        raise ValueError("Invalid files type. Expected one of: %s" %
                         files_type_options)

    # Validate the number of input files before anything else is prepared.
    n_files = len(files_to_classify)
    if n_files == 0:
        sys.exit('No files provided for classification.')
    if n_files > 2:
        sys.exit(
            '{n} files provided for classification. Maximum of 2 files for fastq or 1 file for fasta are'
            ' allowed.'.format(n=n_files))
    if files_type == 'fasta' and n_files == 2:
        sys.exit(
            '{n} files provided for classification. Maximum of 1 file for fasta is'
            ' allowed.'.format(n=n_files))

    is_kraken2 = version_kraken == 2

    # normpath drops a trailing separator, so 'path/to/db/' is labelled 'db'
    # instead of producing an empty label.
    db_label = os.path.basename(os.path.normpath(kraken_db))

    if is_kraken2:
        # kraken2 writes its report file directly; the per-read output is
        # sent to '-' and no output path is returned.
        kraken_output = None
        kraken_report = os.path.join(
            outdir, 'kraken_report.{db}.txt'.format(db=db_label))
    else:
        kraken_output = os.path.join(outdir,
                                     'kraken.{db}.out'.format(db=db_label))
        kraken_report = None

    command = ['kraken2' if is_kraken2 else 'kraken']

    if quick:
        command.append('--quick')

    if db_mem and version_kraken == 1:
        command.append('--preload')
    elif not db_mem and is_kraken2:
        command.append('--memory-mapping')

    command += ['--db', kraken_db, '--threads', str(threads)]
    command += ['--output', '-' if is_kraken2 else kraken_output]

    # kraken_compression_type is defined outside this function; it is used
    # here as returning None or the name that goes in '--<name>-compressed'
    # (presumably 'gzip' / 'bzip2' - verify against its definition).
    compression_type = kraken_compression_type(files_to_classify[0])
    if compression_type is not None:
        command.append('--{type}-compressed'.format(type=compression_type))

    if is_kraken2:
        command += ['--report', kraken_report]
        # NOTE: kraken2's '--confidence FLOAT' option (score threshold in
        # [0, 1], default 0.0) is intentionally not set. According to the
        # Kraken 2 manual, a label's score is C/Q: C is the number of k-mers
        # mapped to LCA values in the clade rooted at the label and Q is the
        # number of k-mers without ambiguous nucleotides. Labels are moved up
        # the tree until the score meets the threshold, and the sequence
        # becomes unclassified if even the root does not reach it.
        if files_type == 'fastq':
            command += ['--minimum-base-quality', str(min_base_quality)]
    else:
        # Only the first Kraken takes the explicit input-type flag.
        command.append('--{type}-input'.format(type=files_type))

    if files_type == 'fastq' and n_files == 2:
        command.append('--paired')

    # One argument per input file.
    command.extend(files_to_classify)

    run_successfully, _, _ = utils_run_command(command=command,
                                               shell_True=False,
                                               timeout_sec_None=None,
                                               print_comand_True=True)

    return run_successfully, kraken_output, kraken_report