예제 #1
0
def run_docker_pull():
    '''
    Pulls the docker image of every program returned by 
    config.get_DO_IMAGES(), one 'docker pull' call per image.
    Two programs are skipped: Parsnp, which is only used after all 
    individual samples have been run, and Kraken, which is memory intensive.
    output: each pull command is handed to run_subprocess() with 
            use_logging=False
    '''

    skipped = ('Kraken', 'Parsnp')

    for name, image_data in config.get_DO_IMAGES().items():
        if name in skipped:
            continue
        # the first element of an entry is the image handed to 'docker pull'
        pull_cmd = 'docker pull ' + image_data[0]
        run_subprocess('', pull_cmd, use_logging=False)
예제 #2
0
    
last update: 24 September 2020                                
"""

import os
import shutil
import re
import config
import toolshed

# Directory settings, each looked up by key in the object returned by
# config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir']
GENOMES_dir = config.get_DO_PATHS()['GENOMES_dir']

# Each config.get_DO_IMAGES() entry is unpacked into two values; going by the
# names, presumably the docker image and its working directory -- confirm in
# config.
Parsnp_image, Parsnp_WorkingDir = config.get_DO_IMAGES()['Parsnp']
NU_image, NU_WorkingDir = config.get_DO_IMAGES()['Newick_utils']

##### housekeeping ############################################################


def sort_input(lo_phylo_tree_data):
    ''' 
    Converts a list of (sp_abbr, isolate, work_dir, ref_name), collected by 
      pipeline_master.py, into a dict of pipeline/ref/ : [(work_dir, isolate)] 
      items.
    param: list lo_phylo_tree_data = list of (pipeline, work_dir, reference, 
      isolate)
    return: dict do_seeds = dictionary of pipeline/ref/ : [(work_dir, isolate)]
    '''
예제 #3
0
    Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
    
last update: 24 September 2020                                
"""

import numpy as np
import config
import toolshed

# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir']

# The 'Kraken' entry of config.get_DO_IMAGES() is unpacked into two values;
# presumably the docker image and its working directory -- confirm in config.
Kraken_image, Kraken_WorkingDir = config.get_DO_IMAGES()['Kraken']

##### running Kraken ##########################################################


def run_Kraken(work_dir):
    ''' 
    Runs Minikraken to classify contigs by species. Output is a number for the 
      classification and kmer counts, which needs to be translated into human-
      readable form.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr
    output: 'kraken_out.txt' file
    '''

    print('\nrunning: Kraken')
예제 #4
0
@authors: 
    Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
    
last update: 24 September 2020                                
"""

import config
import toolshed

# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']

# The 'Qualimap' entry of config.get_DO_IMAGES() is unpacked into two values;
# presumably the docker image and its working directory -- confirm in config.
Qualimap_image, Qualimap_WorkingDir = config.get_DO_IMAGES()['Qualimap']


def run_qualimap(work_dir, suffix):
    ''' 
    Creates an index ('.amb', '.ann', '.bwt', '.pac', '.sa') for a FASTA file.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was 
           used for read mapping
    return: ReturnCode, StdOut, StdErr       
    '''

    print('\nrunning: Qualimap')

    command = 'docker run --rm=True -u $(id -u):$(id -g) '\
            + '-v "' + BASE_PATH + TEMP_dir + work_dir\
예제 #5
0
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
    
last update: 24 September 2020                                
"""

import os
import config
import toolshed

# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
REF_dir = config.get_DO_PATHS()['REF_dir']

# The 'Quast' entry of config.get_DO_IMAGES() is unpacked into two values;
# presumably the docker image and its working directory -- confirm in config.
Quast_image, Quast_WorkingDir = config.get_DO_IMAGES()['Quast']


def run_quast(work_dir, SS_dir, ref_fa_file, check_seq_file):
    ''' 
    Runs Quast, a quality assessment tool for assemblies.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str check_seq_file = name of a sequence file to be QC'd
    output: Quast generates a number of files that will be deposited in the 
            new 'temp/Quast/' folder
    '''

    print('\nrunning: Quast')
예제 #6
0
"""


import subprocess as sub
import os
import config
import toolshed



# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir  = config.get_DO_PATHS()['TEMP_dir']
REF_dir   = config.get_DO_PATHS()['REF_dir'] 


# The 'Mash' entry of config.get_DO_IMAGES() is unpacked into two values;
# presumably the docker image and its working directory -- confirm in config.
Mash_image, Mash_WorkingDir = config.get_DO_IMAGES()['Mash']


##### house-keeping functions #################################################

def make_lo_genomes(active_folder, isolate=''):
    
    """
    Returns a list with the path and name of all files in a genomes subfolder
    param: str active_folder = path to the folder with the reference genomes
    param: str isolate = isolate name, e.g.: 'IDR001234'
    return: list lo_genomes = all genome names present in that folder
    """
   
    lo_genomes = []
    
예제 #7
0
import matplotlib
# sets the backend to anti-grain geometry for .png output
# prevents RuntimeError: Invalid DISPLAY variable in Linux
matplotlib.use('agg')
import matplotlib.pyplot as plt
import os
from numpy import ceil
import config
import toolshed

# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
REF_dir = config.get_DO_PATHS()['REF_dir']

# One config.get_DO_IMAGES() entry per tool, each unpacked into two values;
# presumably the docker image and its working directory -- confirm in config.
Samtools_image, Samtools_WorkingDir = config.get_DO_IMAGES()['Samtools']
Freebayes_image, Freebayes_WorkingDir = config.get_DO_IMAGES()['Freebayes']
VCFlib_image, VCFlib_WorkingDir = config.get_DO_IMAGES()['VCFlib']


def run_samtools_faidx(work_dir, SS_dir, ref_fa_file):
    '''
    Generates a FAI index file, required for FreeBayes.
      Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr
    output: index files
    '''
예제 #8
0
    [email protected]
    
last update: 24 September 2020                                
"""

import config
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import toolshed

# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']

# The 'SPAdes' entry of config.get_DO_IMAGES() is unpacked into two values;
# presumably the docker image and its working directory -- confirm in config.
SPAdes_image, SPAdes_WorkingDir = config.get_DO_IMAGES()['SPAdes']

##### runs SPAdes and housekeeping functions ##################################


def run_spades(work_dir, THREADS, MEMORY, max_read_len):
    '''
    de novo genome assembler
      usage: spades.py [options] -o <out_dir>
      -o <out _dir>     directory to store all the resulting files (required)   
      -1 <filename>     file with forward paired-end reads
      -2 <filename>     file with reverse paired-end reads
      -t <int>          number of threads. [default: 16]
      -m <int>          RAM limit for SPAdes in Gb (terminates if exceeded). 
                        [default: 250]
      -k <int,int,...>  Comma-separated list of k-mer sizes to be used for 
예제 #9
0

import os
import numpy as np
import config
import toolshed



# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH   = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir    = config.get_DO_PATHS()['TEMP_dir']
REF_dir     = config.get_DO_PATHS()['REF_dir'] 


# docker images: one config.get_DO_IMAGES() entry per tool, each unpacked
# into two values; presumably the image and its working directory -- confirm
# in config.
BWA_image, BWA_WorkingDir           = config.get_DO_IMAGES()['BWA']
Samtools_image, Samtools_WorkingDir = config.get_DO_IMAGES()['Samtools']
Picard_image, Picard_WorkingDir     = config.get_DO_IMAGES()['Picard']
BCFtools_image, BCFtools_WorkingDir = config.get_DO_IMAGES()['BCFtools']



def run_bwa_index(work_dir, SS_dir, ref_fa_file):

    ''' 
    Creates an index ('.amb', '.ann', '.bwt', '.pac', '.sa') for a fasta file.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr
    output: index files
예제 #10
0
    Wadsworth Center, New York State Department of Health
    120 New Scotland Ave., Albany, New York 12208
    [email protected]
    
last update: 24 September 2020                                
"""

import zipfile
import os
import toolshed
import config

# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']

# The 'FastQC' entry of config.get_DO_IMAGES() is unpacked into two values;
# presumably the docker image and its working directory -- confirm in config.
FastQC_image, FastQC_WorkingDir = config.get_DO_IMAGES()['FastQC']


def run_fastqc(work_dir, proc_reads):
    """ 
    Runs FastQC on a (processed) read file.
    -d DIR   directory for temporary files when generating report images 
             (default: '?')
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str proc_reads = name of file with forward or reverse reads 
           processed by Trimmomatic
    output: FastQC files 'read_file_fastqc.html' and 'read_file_fastqc.zip'
    """

    print('\nrunning: FastQC')
예제 #11
0
    [email protected]
    
last update: 24 September 2020
"""

import os
import toolshed
import shutil
import config
from numpy import mean, std

# Directory settings read by key from config.get_DO_PATHS().
BASE_PATH = config.get_DO_PATHS()['BASE_PATH']
TEMP_dir = config.get_DO_PATHS()['TEMP_dir']
REF_dir = config.get_DO_PATHS()['REF_dir']

# The 'Trimmomatic' entry of config.get_DO_IMAGES() is unpacked into two
# values; presumably the docker image and its working directory -- confirm in
# config.
Trimmomatic_image, Trimmomatic_WorkingDir = config.get_DO_IMAGES(
)['Trimmomatic']

##### remove reads with too many Gs in a row #######################################


def get_header_symbol(file):
    '''
    Returns the first character of the first line, which identifies the 
      header of a read, usually a "@".
    helper function to remove_poly_Gs()
    param: str file = name of the read file
    return: str CHAR = first character of the header
    
    '''

    with open(file, 'r') as infile: