Example #1
def run_bcftools_mpileup(work_dir, ref_fa_file, THREADS, SS_dir, suffix):
    
    '''
    Runs bcftools mpileup
      -Ou = generate uncompressed BCF output ('-O' selects the output
            type; 'u' = uncompressed BCF)
      -f  = faidx-indexed reference sequence file
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping    
    return: ReturnCode, StdOut, StdErr
    output: 'mpileup' + suffix + '.bcf' file
    ''' 

    print('\nrunning: BCFtools mpileup')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH\
            + ':' + BCFtools_WorkingDir + '" '\
            + '-i ' + BCFtools_image + ' bcftools mpileup '\
            + '-Ou '\
            + '--threads ' + THREADS + ' '\
            + '-f ' + REF_dir + SS_dir + ref_fa_file + ' '\
            + '-o ' + TEMP_dir + work_dir + 'mpileup' + suffix + '.bcf '\
            + TEMP_dir + work_dir + 'marked_duplicates' + suffix + '.bam'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)     

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nbcftools mpileup:\n', StdOut, file=log_file)
    
    return ReturnCode, StdOut, StdErr
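
A minimal usage sketch for this function, chained with run_bcftools_view() from a later example; all argument values are hypothetical and assume the module-level path constants (BASE_PATH, TEMP_dir, ...) are configured:

# hypothetical isolate folder and reference; assumes a Unix-style
# return code where 0 means success
rc, stdout, stderr = run_bcftools_mpileup(
    work_dir='WH200812_001259/', ref_fa_file='F4468.fa',
    THREADS='4', SS_dir='Lpn/', suffix='_1')
if rc == 0:
    run_bcftools_view('4', 'WH200812_001259/', '_1')  # BCF -> VCF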
Example #2
def run_mark_duplicates(work_dir, suffix):
    
    """ 
    Runs Picard MarkDuplicates to mark duplicate reads in the BAM file,
      which are subsequently ignored by downstream applications.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping    
    return: ReturnCode, StdOut, StdErr
    output: 'marked_duplicates' + suffix + '.bam' and 
            'marked_dup_metrics' + suffix + '.txt' files
    """     

    print('\nrunning: Picard MarkDuplicates')
    
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + Picard_WorkingDir + '" '\
            + '-i ' + Picard_image + ' MarkDuplicates '\
            + 'I=bwa_mapped' + suffix + '.bam ' \
            + 'O=marked_duplicates' + suffix + '.bam ' \
            + 'M=marked_dup_metrics' + suffix + '.txt'    

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)     

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\npicard MarkDuplicates:\n', StdOut, file=log_file)
    
    return ReturnCode, StdOut, StdErr
Example #3
def run_samtools_index(work_dir, THREADS, suffix, bam_file):
    
    '''
    Indexes the reads in a sorted BAM file.
      Usage: samtools index [-bc] [-m INT] <in.bam> [out.index]
       -@ INT   Sets the number of threads [none]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping   
    param: str bam_file = name of the input BAM file
    return: ReturnCode, StdOut, StdErr
    output: index files
    '''

    print('\nrunning: Samtools index')
    
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + Samtools_WorkingDir + '" '\
            + '-i ' + Samtools_image + ' samtools index '\
            + '-@ ' + THREADS + ' '\
            + bam_file                      

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)     

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nsamtools index:\n', StdOut, file=log_file)
    
    return ReturnCode, StdOut, StdErr
Example #4
def run_samtools_sort(work_dir, THREADS, suffix):

    '''
    Sorts an alignment file.
      Usage: samtools sort [options...] [in.bam]
      -o FILE        Write final output to FILE rather than standard output
      --threads INT  Number of additional threads to use [0]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping    
    return: ReturnCode, StdOut, StdErr
    output: 'bwa_mapped' + suffix + '.bam' file, where suffix = '_1', '_2', ...
    '''

    print('\nrunning: Samtools sort')
        
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + Samtools_WorkingDir + '" '\
            + '-i ' + Samtools_image + ' samtools sort '\
            + '--threads ' + THREADS + ' '\
            + '-o bwa_mapped' + suffix + '.bam '\
            + 'bwa_mapped' + suffix + '.sam'                      

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)     

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nsamtools sort:\n', StdOut, file=log_file)
    
    return ReturnCode, StdOut, StdErr
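
Since samtools index requires a coordinate-sorted BAM file, a typical ordering chains this function with run_samtools_index() from the previous example; a sketch with hypothetical argument values:

rc, stdout, stderr = run_samtools_sort('WH200812_001259/', '4', '_1')
if rc == 0:  # index only once sorting has succeeded
    run_samtools_index('WH200812_001259/', '4', '_1', 'bwa_mapped_1.bam')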
Example #5
def run_bcftools_view(THREADS, work_dir, suffix):
    
    '''
    Converts a bcf file, such as 'mpileup.bcf', into a VCF file.
    param: str THREADS = number of threads available
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping    
    return: ReturnCode, StdOut, StdErr
    output: 'mpileup' + suffix + '.vcf' file
    '''

    print('\nrunning: BCFtools view')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + BCFtools_WorkingDir + '" '\
            + '-i ' + BCFtools_image + ' bcftools view '\
            + '--threads ' + THREADS + ' '\
            + '-o mpileup' + suffix + '.vcf '\
            + 'mpileup' + suffix + '.bcf'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)     

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nbcftools view:\n', StdOut, file=log_file)
    
    return ReturnCode, StdOut, StdErr
Example #6
def run_bwa_mem(work_dir, THREADS, SS_dir, ref_fa_file, suffix):
    
    '''
    Mapping of reads to a reference genome.
      Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]
      -t INT     number of threads [1]
      -o FILE    sam file to output results to [stdout]    
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping    
    return: ReturnCode, StdOut, StdErr
    output: 'bwa_mapped' + suffix + '.sam' file, where suffix = '_1', '_2', ...
    '''

    print('\nrunning: BWA mem')
    
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH\
            + ':' + BWA_WorkingDir + '" '\
            + '-i ' + BWA_image + ' bwa mem '\
            + '-t ' + THREADS + ' '\
            + REF_dir + SS_dir + ref_fa_file + ' '\
            + TEMP_dir + work_dir + 'paired_reads_1.fq '\
            + TEMP_dir + work_dir + 'paired_reads_2.fq '\
            + '-o ' + TEMP_dir + work_dir + 'bwa_mapped' + suffix + '.sam'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)     

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nBWA MEM:\n', StdOut, file=log_file)
    
    return ReturnCode, StdOut, StdErr
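
bwa mem expects the reference FASTA to be indexed beforehand with run_bwa_index() from a later example; a minimal sketch with hypothetical values:

rc, stdout, stderr = run_bwa_index('WH200812_001259/', 'Lpn/', 'F4468.fa')
if rc == 0:  # map reads only once the index exists
    run_bwa_mem('WH200812_001259/', '4', 'Lpn/', 'F4468.fa', '_1')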
Example #7
def run_Kraken(work_dir):
    ''' 
    Runs Minikraken to classify contigs by species. The output consists of
      taxon IDs and k-mer counts, which need to be translated into human-
      readable form.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr
    output: 'kraken_out.txt' file
    '''

    print('\nrunning: Kraken')

    # that's the database that comes with the docker image
    KRAKEN_DATABASE = '/kraken-database/minikraken_20171013_4GB'

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + OUTPUT_dir + work_dir\
            + ':' + Kraken_WorkingDir + '" '\
            + '-i ' + Kraken_image + ' kraken '\
            + '--preload --db ' +  KRAKEN_DATABASE + ' '\
            + 'SPAdes_contigs.fa '\
            + '--output kraken_out.txt'

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nKraken:\n', command, file=log_file)

    # the first param is a '' instead of 'work_dir' because the log.txt
    # file has been moved from /temp/ to /output/
    # see toolshed.run_subprocess()
    ReturnCode, StdOut, StdErr = toolshed.run_subprocess('', command, True)

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nKraken:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
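
A sketch of the intended two-step use together with run_Kraken_translate() from a later example; the isolate folder is hypothetical:

rc, stdout, stderr = run_Kraken('WH200812_001259/')
if rc == 0:  # translate the taxon IDs only if classification succeeded
    run_Kraken_translate('WH200812_001259/')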
Example #8
def run_nw_display(seed, work_dir):
    ''' 
    Generates an ASCII-based tree for the report.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str seed = combination of sp_abbr and reference, e.g.: 'Lpn/F4468/'
    output: a tree, in ASCII format, added to the report.txt file
    '''

    print('\nrunning: NW_display')

    # Note that evolbioinfo/newick_utilities:v1.6 uses "WorkingDir": ""
    # Note: no need to call 'nw_display' explicitly
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + 'parsnp/' + seed\
            + ':' + NU_WorkingDir + '" '\
            + '-i ' + NU_image + ' '\
            + 'parsnp.tree'

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nNewick display:\n', command, file=log_file)

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess('', command, True)

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nNewick display:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
Example #9
def run_freebayes(work_dir, ref_fa_file, SS_dir, suffix):
    '''
    Runs FreeBayes to call SNPs, INDELs, and complex mutations.
      -p INT    ploidy of the organism
      -f FILE   Use FILE as the reference sequence for analysis. An index file 
                (FILE.fai) will be created if none exists.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping
    output: a VCF-file, 'freebayes_all.vcf'    
    '''

    print('\nrunning: Freebayes')

    command = 'docker run'\
            + ' -v "' + BASE_PATH\
            + ':' + Freebayes_WorkingDir + '" '\
            + '-i ' + Freebayes_image + ' freebayes '\
            + '-f ' + REF_dir + SS_dir + ref_fa_file + ' '\
            + '-p 1 '\
            + TEMP_dir + work_dir + 'marked_duplicates' + suffix + '.bam '\
            + '> ' + BASE_PATH + TEMP_dir + work_dir + 'freebayes_all.vcf'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nFreebayes:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
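
Note that the command relies on shell redirection ('>') to write the VCF file, so toolshed.run_subprocess() presumably executes it through a shell. A sketch chaining this function with run_vcffilter() from a later example, using hypothetical values:

rc, stdout, stderr = run_freebayes('WH200812_001259/', 'F4468.fa', 'Lpn/', '_1')
if rc == 0:
    run_vcffilter('WH200812_001259/', DP_max=500)  # DP_max is hypothetical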
Example #10
def run_mash_dist(work_dir, ref_msh_file, query_msh_file, suffix):
    
    """ 
    Returns the distance between the references and the query 
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str ref_msh_file = reference sketch file
    param: str query_msh_file = query sketch file 
    param: str suffix = 'FAvNCBI' or 'RvSp'
    output: 'distances_' + suffix + '.tab' file, suffix = 'FAvNCBI' or 'RvSp'
    """ 

    print('\nrunning: Mash dist')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH\
            + ':' + Mash_WorkingDir + '" '\
            + '-i ' + Mash_image + ' mash dist '\
            + ref_msh_file + ' ' + query_msh_file + ' '\
            + '> ' + BASE_PATH + TEMP_dir + work_dir + 'distances_'\
            + suffix + '.tab'
    
    print('\n## Running:\n', command, '\n')    
    # execute 'mash dist' and write results to file
        
    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nMash distance:\n', StdOut, file=log_file)
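
A self-contained sketch for reading the resulting distance table; the column layout (reference-ID, query-ID, distance, p-value, shared-hashes) follows Mash's documented output format:

import csv

def parse_mash_distances(tab_file):
    '''Return (ref_id, query_id, distance) tuples, closest match first.'''
    hits = []
    with open(tab_file, newline='') as infile:
        for row in csv.reader(infile, delimiter='\t'):
            hits.append((row[0], row[1], float(row[2])))
    return sorted(hits, key=lambda hit: hit[2])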
Example #11
def run_quast(work_dir, SS_dir, ref_fa_file, check_seq_file):
    ''' 
    Runs Quast, a quality assessment tool for assemblies.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str check_seq_file = name of a sequence file to be QC'd
    output: Quast generates a number of files that will be deposited in the 
            new 'temp/' + work_dir + 'quast/' folder
    '''

    print('\nrunning: Quast')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH +\
            ':' + Quast_WorkingDir + '" '\
            + '-i ' + Quast_image + ' quast.py '\
            + '-o temp/' + work_dir + 'quast/ '\
            + '-R ' + REF_dir + SS_dir + ref_fa_file\
            + ' --fast '\
            + TEMP_dir + work_dir + check_seq_file

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nQuast:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
Example #12
def run_Kraken_translate(work_dir):
    ''' 
    Converts the initial Kraken output into human-readable form.        
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr
    '''

    print('\nrunning: Kraken-translate')

    # that's the database that comes with the docker image
    KRAKEN_DATABASE = '/kraken-database/minikraken_20171013_4GB'

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + OUTPUT_dir + work_dir\
            + ':' + Kraken_WorkingDir + '" '\
            + '-i ' + Kraken_image + ' kraken-translate '\
            + '--db ' +  KRAKEN_DATABASE + ' '\
            + 'kraken_out.txt '\
            + '> ' + BASE_PATH + OUTPUT_dir + work_dir + 'kraken_res.txt'

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nKraken-translate:\n', command, file=log_file)

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess('', command, True)

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nKraken-translate:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
Example #13
def run_samtools_faidx(work_dir, SS_dir, ref_fa_file):
    '''
    Generates a FAI index file, required for FreeBayes.
      Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr
    output: index files
    '''

    print('\nrunning: Samtools faidx')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + REF_dir + SS_dir\
            + ':' + Samtools_WorkingDir + '" '\
            + '-i ' + Samtools_image + ' samtools faidx '\
            + ref_fa_file

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nsamtools faidx:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
Example #14
def run_fastqc(work_dir, proc_reads):
    """ 
    Runs FastQC on a (processed) read file.
    -d DIR   directory for temporary files when generating report images 
             (default: '?')
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str proc_reads = name of file with forward or reverse reads 
           processed by Trimmomatic
    output: FastQC files 'read_file_fastqc.html' and 'read_file_fastqc.zip'
    """

    print('\nrunning: FastQC')

    command  = 'docker run --rm=True -u $(id -u):$(id -g) '\
             + '-v "' + BASE_PATH + TEMP_dir + work_dir \
             + ':' + FastQC_WorkingDir + '" '\
             + '-i ' + FastQC_image + ' fastqc '\
             + '-d temp/ '\
             + proc_reads

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nFastQC:\n', StdOut, file=log_file)
Example #15
def run_bwa_index(work_dir, SS_dir, ref_fa_file):

    ''' 
    Creates an index ('.amb', '.ann', '.bwt', '.pac', '.sa') for a fasta file.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr
    output: index files
    '''

    print('\nrunning: BWA index')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + REF_dir + SS_dir\
            + ':' + BWA_WorkingDir + '" '\
            + '-i ' + BWA_image + ' bwa index '\
            + ref_fa_file

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True) 

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nBWA index:\n', StdOut, file=log_file)
    
    return ReturnCode, StdOut, StdErr
Example #16
def run_samtools_depth(work_dir, THREADS, suffix):

    '''
    runs: samtools depth
    -aa   output absolutely all positions
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping 
    return: ReturnCode, StdOut, StdErr
    output: 'samtools_depth' + suffix + '.txt' file
    '''

    print('\nrunning: Samtools depth')
        
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + Samtools_WorkingDir + '" '\
            + '-i ' + Samtools_image + ' samtools depth '\
            + '-aa '\
            + 'marked_duplicates' + suffix + '.bam '\
            + '> ' + BASE_PATH + TEMP_dir + work_dir\
            + 'temp/samtools_depth' + suffix + '.txt'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)  
    
    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nsamtools depth:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
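
Since 'samtools depth -aa' writes one tab-separated line (chrom, pos, depth) per reference position, the mean coverage can be derived from the output file with a short, self-contained helper such as this sketch:

import statistics

def mean_depth(depth_file):
    '''Mean per-base coverage from a 'samtools depth -aa' output file.'''
    with open(depth_file) as infile:
        depths = [int(line.split('\t')[2]) for line in infile]
    return statistics.mean(depths) if depths else 0.0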
Example #17
def run_parsnp(THREADS, work_dir, seed, isolate, include_all):
    ''' 
    Runs Parsnp for core-genome alignment and analysis.
    parsnp accepts single- and multi-fasta files containing 'ACGTN' or
    'acgtn' or a mix
      -r REF = specify the reference genome for Parsnp: either the isolate for 
               new references or the mapping reference
      -o DIR = output directory; default [./P_CURRDATE_CURRTIME]
      -c     = forces inclusion of all genomes in a given directory; remove to 
               exclude strains that are too distant, which can cause Parsnp to 
               fail
      -d DIR = directory containing genomes/contigs/scaffolds; Note: no '/' 
               needed after DIR, added automatically
      -v FLAG = verbose output (default = NO)
      -p INT = number of threads to use (default = 1)
    param: str THREADS = number of threads available
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str seed = combination of sp_abbr and reference, e.g.: 'Lpn/F4468/'
    param: str isolate = isolate name, e.g.: 'IDR001234'
    param: bool include_all = if True, forces the inclusion of all genomes in
           a given directory, might lead to a crash of Parsnp if a genome is
           too distant; if False, uses only similar genomes, which might 
           exclude genomes of interest
    '''

    print('\nrunning: Parsnp')

    # select a reference genome, e.g. 'ref1.fa' for 'Aba/ref1/' or 'iso1.fa'
    # if 'iso1' is a new reference
    parsnp_reference = seed.split('/')[-2] + '.fa'
    if 'All_refs' in seed:
        parsnp_reference = isolate + '.fa'

    # force inclusion of all genomes
    force_all = ' '
    if include_all:
        force_all = '-c '

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH\
            + ':' + Parsnp_WorkingDir + '" '\
            + '-i ' + Parsnp_image + ' parsnp '\
            + '-d ' + GENOMES_dir + seed + ' '\
            + '-r ' + GENOMES_dir + seed + parsnp_reference + ' '\
            + '-o ' + TEMP_dir + 'parsnp/' + seed + ' '\
            + force_all\
            + '-v -p ' + THREADS

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nParsnp:\n', command, file=log_file)

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess('', command, True)

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nParsnp:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
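
A minimal sketch of how this function feeds the tree display in run_nw_display() from an earlier example; the argument values are hypothetical:

rc, stdout, stderr = run_parsnp('4', 'WH200812_001259/', 'Lpn/F4468/',
                                'IDR001234', include_all=False)
if rc == 0:
    run_nw_display('Lpn/F4468/', 'WH200812_001259/')  # ASCII tree for report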
Example #18
def run_spades(work_dir, THREADS, MEMORY, max_read_len):
    '''
    de novo genome assembler
      usage: spades.py [options] -o <out_dir>
      -o <out_dir>      directory to store all the resulting files (required)
      -1 <filename>     file with forward paired-end reads
      -2 <filename>     file with reverse paired-end reads
      -t <int>          number of threads. [default: 16]
      -m <int>          RAM limit for SPAdes in Gb (terminates if exceeded). 
                        [default: 250]
      -k <int,int,...>  comma-separated list of k-mer sizes; here 
                         '-k 21,33,55,77,99,127' is used for 250 bp reads 
                         and '-k 21,33,55,77' for 150 bp reads
      --careful         tries to reduce number of mismatches and short indels
      --cov-cutoff      Read coverage cutoff value. Must be a positive float 
                         value, or 'auto', or 'off'.  When 'auto': SPAdes 
                         automatically computes coverage threshold using 
                         conservative strategy    
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available ('-t' option; not 
           currently added to the command)
    param: str MEMORY = available memory in Gb ('-m' option; not currently 
           added to the command)
    param: int max_read_len = length of largest read, important for selecting
           the size of kmers
    return: ReturnCode, StdOut, StdErr
    output: folder with results
    '''

    print('\nrunning: SPAdes\n')

    if max_read_len > 175:
        k_param = ' -k 21,33,55,77,99,127'
    else:
        k_param = ' -k 21,33,55,77'

    command  = 'docker run --rm=True -u $(id -u):$(id -g) '\
             + '-v "' + BASE_PATH + TEMP_dir + work_dir\
             + ':' + SPAdes_WorkingDir + '" '\
             + '-i ' + SPAdes_image + ' spades.py '\
             + '-1 paired_reads_1.fq '\
             + '-2 paired_reads_2.fq '\
             + k_param\
             + ' --careful --cov-cutoff auto '\
             + '-o SPAdes'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nSPAdes, all reads (abbreviated):\n',
              StdOut.replace('\t', ' ').replace('\n', ' ')[:700],
              file=log_file)

    return ReturnCode, StdOut, StdErr
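
SPAdes accepts only odd k-mer sizes smaller than the read length (127 is the largest value it allows), which is why the long-read k-mer list above is capped at 127. A usage sketch with hypothetical values:

rc, stdout, stderr = run_spades('WH200812_001259/', THREADS='4',
                                MEMORY='16', max_read_len=251)
# max_read_len=251 selects '-k 21,33,55,77,99,127'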
Example #19
def run_mash_sketch(active_folder, work_dir, out_file, in_data, 
                    in_data_type=''):

    """
    Runs Mash sketch on one or more FASTQ or FASTA files
      '.msh' will be added automatically to out_file
    param: str active_folder = path to one of three possible folders
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str out_file = name of the mash sketch output file
    param: list in_data = list of one or more FASTA file(s)
    param: str in_data_type = determines which sketch options to use
    output: a MSH sketch file for the input sequences
    """

    print('\nrunning: Mash sketch')
    
    # parameters for running Mash sketch: few, short kmers for reads; 
    #  more, long kmers for genomes
    # -k = kmer size
    # -s = Sketch size, number of min-hashes
    # -m = minimum copies of each kmer required to pass the read noise filter
    # no parameters => default settings: -k 21, -s 1000
    if in_data_type == 'lo_genomes':
        param = '-k 16 -s 400 ' 
    elif in_data_type == 'comb_reads':
        param = '-m 2 -k 16 -s 400 ' 
    else:
        param = ''
        
    # one or more files that need to be sketched; multiple files are separated 
    #   by a ' ', e.g.: 'mash sketch -o outfile Lpn.fa Tmi.fa Eco.fa'
    lo_files = ' '.join(in_data)
    
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
              + '-v "' + active_folder\
              + ':' + Mash_WorkingDir + '" '\
              + '-i ' + Mash_image + ' mash sketch '\
              + param\
              + '-o ' + out_file + ' '\
              + lo_files

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)
    
    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nMash sketch:\n', StdOut, file=log_file)
Example #20
def run_samtools_flagstat(work_dir, THREADS, suffix, MAPPED_THRESHOLD):

    '''
    runs: samtools flagstat
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping 
    param: int MAPPED_THRESHOLD = min percentage of mapped reads 
    return: ReturnCode, StdOut, StdErr
    return: float percent_mapped = percentage of mapped reads
    output: text added to report
    '''

    print('\nrunning: Samtools flagstat')
        
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + Samtools_WorkingDir + '" '\
            + '-i ' + Samtools_image + ' samtools flagstat '\
            + '--threads ' + THREADS + ' '\
            + 'marked_duplicates' + suffix + '.bam'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)   
    
    percent_mapped = float(StdOut.split('mapped (')[1].split('%')[0])
    print('percent_mapped:', percent_mapped)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nsamtools flagstat:\n', StdOut, file=log_file)

    with open(BASE_PATH + TEMP_dir + work_dir + 'report.txt', 'a') as report:
        print('\nAlignment QC (Samtools flagstat):', file=report)
        print(StdOut.replace('stdout:\n',''), file=report)
        print('\nPercentage of mapped reads:', percent_mapped, file=report)

        if percent_mapped <= MAPPED_THRESHOLD:
            print('\nNOTE:\nPercentage of mapped reads below threshold.\n'\
                + 'Adding the isolate to the list of candidate reference '\
                + 'genomes.',
                  file=report)

    return ReturnCode, StdOut, StdErr, percent_mapped
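
The percentage parsing above relies on samtools flagstat emitting a line of the form '1234567 + 0 mapped (98.76% : N/A)' (example values are illustrative). A slightly more defensive variant, as a sketch:

import re

def parse_percent_mapped(flagstat_stdout):
    '''Extract the mapped-read percentage, or None if no match is found.'''
    match = re.search(r'mapped \((\d+(?:\.\d+)?)%', flagstat_stdout)
    return float(match.group(1)) if match else None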
Example #21
def run_samtools_idxstats(work_dir, THREADS, suffix):

    '''
    runs: samtools idxstats
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping 
    return: ReturnCode, StdOut, StdErr
    return: float percent_mapped = percentage of mapped reads
    output: text added to report
    '''

    print('\nrunning: Samtools idxstats')
        
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + Samtools_WorkingDir + '" '\
            + '-i ' + Samtools_image + ' samtools idxstats '\
            + '--threads ' + THREADS + ' '\
            + 'marked_duplicates' + suffix + '.bam'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)  
    
    # e.g.: stdout=b'NZ_CP006644.1\t6205897\t188455\t13709\n
    #               NZ_CP011450.1\t374401\t6147\t317\n*\t0\t0\t2900154\n'
    # e.g.:
    #  NZ_CP006644.1   6205897 188455  13709
    #  NZ_CP011450.1   374401  6147    317
    #  *       0       0       2900154
    # e.g.: ['NZ_CP006644.1', '6205897', '188455', '13709', 'NZ_CP011450.1', 
    #        '374401', '6147', '317', '*', '0', '0', '2900154', '']

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nsamtools idxstats:\n', StdOut, file=log_file)

    with open(BASE_PATH + TEMP_dir + work_dir + 'report.txt', 'a') as report:
        print('\n\nAlignment QC (Samtools idxstats):', file=report)
        print('ref_fa_file\tlen\tmapped\tunmapped', file=report)
        print(StdOut.replace('stdout:\n',''), file=report)

    return ReturnCode, StdOut, StdErr
Example #22
def run_vcffilter(work_dir, DP_max):
    '''
    Runs vcffilter to remove low-quality SNPs.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: int DP_max = maximum total read depth at that SNP locus
    output: a VCF-file, 'freebayes.vcf'    
    '''

    print('\nrunning: vcffilter')

    # filtering thresholds to sort out low probability SNPs
    QUAL_threshold = 20  # min SNP quality (phred scale)
    DP_min = 10  # min Total read depth at the locus
    QA_threshold = 20  # min Alternate allele quality sum (phred scale)
    AO_DP_ratio = 0.899  # min fraction of reads supporting the SNP, where
    #   AO is the count of full observations of the alternate haplotype
    # hard filter implemented as per Erik Garrison (see command):
    # SAF > 0 & SAR > 0   # remove alleles that are only seen on one strand

    command = 'docker run'\
            + ' -v "' + BASE_PATH\
            + ':' + VCFlib_WorkingDir + '" '\
            + '-i ' + VCFlib_image + ' vcffilter '\
            + '-f  "QUAL > ' + str(QUAL_threshold) + ' '\
            + '& DP > ' + str(DP_min) + ' & DP < ' + str(DP_max) + ' '\
            + '& QA > ' + str(QA_threshold) + ' '\
            + '& SAF > 0 & SAR > 0 ' \
            + '& AO > ' + str(AO_DP_ratio) + ' * DP" '\
            + TEMP_dir + work_dir + 'freebayes_all.vcf '\
            + '> ' + BASE_PATH + TEMP_dir + work_dir + 'freebayes.vcf'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nFreebayes:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
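
For reference, with a hypothetical DP_max of 500 the filter expression that the code above assembles and passes to vcffilter via '-f' is:

DP_max = 500  # hypothetical value
filter_expr = ('QUAL > 20 & DP > 10 & DP < ' + str(DP_max)
               + ' & QA > 20 & SAF > 0 & SAR > 0 & AO > 0.899 * DP')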
Example #23
def run_nw_display_svg(seed, work_dir):
    ''' 
    Generates a prettier tree in SVG format.
    uses a css.map file to change the looks of the tree in SVG format
      -s     = produces a pretty Scalable Vector Graphic (.svg) file for 
               viewing in a web browser
      -w INT = width of the figure in pixels (in -s mode; columns otherwise)
    param: str seed = combination of sp_abbr and reference, e.g.: 'Lpn/F4468/'
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    output: tree in SVG format
    '''

    print('\nrunning: NW_display')

    # Note that evolbioinfo/newick_utilities:v1.6 uses "WorkingDir": ""
    # Note: no need to call 'nw_display' explicitly
    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + 'parsnp/' + seed\
            + ':' + NU_WorkingDir + '" '\
            + '-i ' + NU_image + ' '\
            + '-s -w 700 -b opacity:0 '\
            + '-o parsnp_ornament.map '\
            + 'parsnp.tree'

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nNewick display:\n', command, file=log_file)

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess('', command, True)

    tree_data = StdOut.replace('stdout:\n', '')

    with open(BASE_PATH + TEMP_dir + 'parsnp/' + seed + 'parsnp_tree.svg',
              'w') as out_file:
        print(tree_data, file=out_file)

    with open(BASE_PATH + OUTPUT_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nNewick display:\n', StdErr, file=log_file)

    return ReturnCode, StdOut, StdErr
Example #24
def run_trimmomatic(work_dir, F_READS, R_READS, THREADS, MinLen='100'):
    '''
    Trimming of Illumina reads.
      PE: paired ends = two input, four output files    
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str F_READS = file with forward paired reads
    param: str R_READS = file with reverse paired reads
    param: str THREADS = number of threads available
    param: str MinLen  = minimum read length, default here '100'
    return: ReturnCode, StdOut, StdErr
    output: four read files: paired/unpaired and forward/reverse
    '''

    print('\nrunning: Trimmomatic\n')

    INPUT_FILES  = 'raw_reads_noG_1.fq '\
                 + 'raw_reads_noG_2.fq '

    OUTPUT_FILES = 'paired_reads_1.fq '\
                 + 'temp/unpaired_reads_1.fq '\
                 + 'paired_reads_2.fq '\
                 + 'temp/unpaired_reads_2.fq '

    command  = 'docker run --rm=True -u $(id -u):$(id -g) '\
             + '-v "' + BASE_PATH + TEMP_dir + work_dir\
             + ':' + Trimmomatic_WorkingDir + '" '\
             + '-i ' + Trimmomatic_image + ' trimmomatic PE '\
             + INPUT_FILES\
             + OUTPUT_FILES\
             + '-threads ' + THREADS + ' '\
             + '-trimlog temp/trimmomatic_log.txt '\
             + 'ILLUMINACLIP:NexteraPE-PE.fa:2:30:10 '\
             + 'LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:' + MinLen

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    return ReturnCode, StdOut, StdErr
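
A usage sketch feeding the trimmed paired reads into run_fastqc() from an earlier example; the isolate folder is hypothetical, and note that the command itself reads the fixed 'raw_reads_noG_*.fq' input names:

rc, stdout, stderr = run_trimmomatic('WH200812_001259/', 'raw_reads_noG_1.fq',
                                     'raw_reads_noG_2.fq', '4', MinLen='100')
if rc == 0:
    run_fastqc('WH200812_001259/', 'paired_reads_1.fq')
    run_fastqc('WH200812_001259/', 'paired_reads_2.fq')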
Example #25
def run_mash_info(active_folder, work_dir, out_file):
    
    """
    Writes the data present in the '.msh' file to the log.txt file
    param: str active_folder = path to one of three possible folders
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str out_file = name of the output file
    output: text added to log.txt file
    """

    print('\nrunning: Mash info')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + active_folder\
            + ':' + Mash_WorkingDir + '" '\
            + '-i ' + Mash_image + ' mash info '\
            + out_file

    # look up the genomes present in the .msh file and print to the log file    
    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(work_dir, command, True)
    
    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nMash sketch results:\n', StdOut, file=log_file)
Example #26
def run_qualimap(work_dir, suffix):
    ''' 
    Runs Qualimap bamqc to assess the quality of the read mapping in a BAM file.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping
    return: ReturnCode, StdOut, StdErr       
    '''

    print('\nrunning: Qualimap')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
            + ':' + Qualimap_WorkingDir + '" '\
            + '-i ' + Qualimap_image + ' qualimap bamqc '\
            + '-bam marked_duplicates' + suffix + '.bam'

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        print('\nqualimap:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr