def run_docker_pull():
    '''
    Pulls the docker image of every program listed in the config file.
    Two programs are skipped here: Parsnp, which is only used after all
    individual samples have been run, and Kraken, which is memory intensive.
    output: one 'docker pull <image>' command is passed to run_subprocess()
            per remaining program, with logging turned off
    '''
    # programs whose images are not pulled up front
    deferred = ('Kraken', 'Parsnp')

    images = config.get_DO_IMAGES()
    for name, image_info in images.items():
        if name in deferred:
            continue
        # the first element of each entry is the image name to pull
        pull_cmd = 'docker pull ' + image_info[0]
        run_subprocess('', pull_cmd, use_logging=False)
last update: 24 September 2020 """ import os import shutil import re import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir'] GENOMES_dir = config.get_DO_PATHS()['GENOMES_dir'] Parsnp_image, Parsnp_WorkingDir = config.get_DO_IMAGES()['Parsnp'] NU_image, NU_WorkingDir = config.get_DO_IMAGES()['Newick_utils'] ##### housekeeping ############################################################ def sort_input(lo_phylo_tree_data): ''' Converts a list of (sp_abbr, isolate, work_dir, ref_name), collected by pipeline_master.py, into a dict of pipeline/ref/ : [(work_dir, isolate)] items. param: list lo_phylo_tree_data = list of (pipeline, work_dir, reference, isolate) return: dict do_seeds = dictionary of pipeline/ref/ : [(work_dir, isolate)] '''
Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import numpy as np import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] OUTPUT_dir = config.get_DO_PATHS()['OUTPUT_dir'] Kraken_image, Kraken_WorkingDir = config.get_DO_IMAGES()['Kraken'] ##### running Kraken ########################################################## def run_Kraken(work_dir): ''' Runs Minikraken to classify contigs by species. Output is a number for the classification and kmer counts, which needs to be translated into human-readable form. param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/' return: ReturnCode, StdOut, StdErr output: 'kraken_out.txt' file ''' print('\nrunning: Kraken')
@authors: Wolfgang Haas, Pascal Lapierre, and Kimberlee A. Musser Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] Qualimap_image, Qualimap_WorkingDir = config.get_DO_IMAGES()['Qualimap'] def run_qualimap(work_dir, suffix): ''' Creates an index ('.amb', '.ann', '.bwt', '.pac', '.sa') for a FASTA file. param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/' param: str suffix = distinguishes files if more than one reference was used for read mapping return: ReturnCode, StdOut, StdErr ''' print('\nrunning: Qualimap') command = 'docker run --rm=True -u $(id -u):$(id -g) '\ + '-v "' + BASE_PATH + TEMP_dir + work_dir\
Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import os import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] REF_dir = config.get_DO_PATHS()['REF_dir'] Quast_image, Quast_WorkingDir = config.get_DO_IMAGES()['Quast'] def run_quast(work_dir, SS_dir, ref_fa_file, check_seq_file): ''' Runs Quast, a quality assessment tool for assemblies. param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/' param: str SS_dir = species-specific directory, e.g.: 'Lpn/' param: str ref_fa_file = name of a reference strain's FASTA file param: str check_seq_file = name of a sequence file to be QC'd output: Quast generates a number of files that will be deposited in the new 'temp/Quast/' folder ''' print('\nrunning: Quast')
""" import subprocess as sub import os import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] REF_dir = config.get_DO_PATHS()['REF_dir'] Mash_image, Mash_WorkingDir = config.get_DO_IMAGES()['Mash'] ##### house-keeping functions ################################################# def make_lo_genomes(active_folder, isolate=''): """ Returns a list with the path and name of all files in a genomes subfolder param: str active_folder = path to the folder with the reference genomes param: str isolate = isolate name, e.g.: 'IDR001234' return: list lo_genomes = all genome names present in that folder """ lo_genomes = []
import matplotlib # sets the backend to anti-grain geometry for .png output # prevents RuntimeError: Invalid DISPLAY variable in Linux matplotlib.use('agg') import matplotlib.pyplot as plt import os from numpy import ceil import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] REF_dir = config.get_DO_PATHS()['REF_dir'] Samtools_image, Samtools_WorkingDir = config.get_DO_IMAGES()['Samtools'] Freebayes_image, Freebayes_WorkingDir = config.get_DO_IMAGES()['Freebayes'] VCFlib_image, VCFlib_WorkingDir = config.get_DO_IMAGES()['VCFlib'] def run_samtools_faidx(work_dir, SS_dir, ref_fa_file): ''' Generates a FAI index file, required for FreeBayes. Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]] param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/' param: str SS_dir = species-specific directory, e.g.: 'Lpn/' param: str ref_fa_file = name of a reference strain's FASTA file return: ReturnCode, StdOut, StdErr output: index files '''
[email protected] last update: 24 September 2020 """ import config import matplotlib.pyplot as plt import numpy as np import os import shutil import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] SPAdes_image, SPAdes_WorkingDir = config.get_DO_IMAGES()['SPAdes'] ##### runs SPAdes and housekeeping functions ################################## def run_spades(work_dir, THREADS, MEMORY, max_read_len): ''' de novo genome assembler usage: spades.py [options] -o <out_dir> -o <out _dir> directory to store all the resulting files (required) -1 <filename> file with forward paired-end reads -2 <filename> file with reverse paired-end reads -t <int> number of threads. [default: 16] -m <int> RAM limit for SPAdes in Gb (terminates if exceeded). [default: 250] -k <int,int,...> Comma-separated list of k-mer sizes to be used for
import os import numpy as np import config import toolshed BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] REF_dir = config.get_DO_PATHS()['REF_dir'] # docker images BWA_image, BWA_WorkingDir = config.get_DO_IMAGES()['BWA'] Samtools_image, Samtools_WorkingDir = config.get_DO_IMAGES()['Samtools'] Picard_image, Picard_WorkingDir = config.get_DO_IMAGES()['Picard'] BCFtools_image, BCFtools_WorkingDir = config.get_DO_IMAGES()['BCFtools'] def run_bwa_index(work_dir, SS_dir, ref_fa_file): ''' Creates an index ('.amb', '.ann', '.bwt', '.pac', '.sa') for a fasta file. param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/' param: str SS_dir = species-specific directory, e.g.: 'Lpn/' param: str ref_fa_file = name of a reference strain's FASTA file return: ReturnCode, StdOut, StdErr output: index files
Wadsworth Center, New York State Department of Health 120 New Scotland Ave., Albany, New York 12208 [email protected] last update: 24 September 2020 """ import zipfile import os import toolshed import config BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] FastQC_image, FastQC_WorkingDir = config.get_DO_IMAGES()['FastQC'] def run_fastqc(work_dir, proc_reads): """ Runs FastQC on a (processed) read file. -d DIR directory for temporary files when generating report images (default: '?') param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/' param: str proc_reads = name of file with forward or reverse reads processed by Trimmomatic output: FastQC files 'read_file_fastqc.html' and 'read_file_fastqc.zip' """ print('\nrunning: FastQC')
[email protected] last update: 24 September 2020 """ import os import toolshed import shutil import config from numpy import mean, std BASE_PATH = config.get_DO_PATHS()['BASE_PATH'] TEMP_dir = config.get_DO_PATHS()['TEMP_dir'] REF_dir = config.get_DO_PATHS()['REF_dir'] Trimmomatic_image, Trimmomatic_WorkingDir = config.get_DO_IMAGES( )['Trimmomatic'] ##### remove reads with too many Gs in a row ####################################### def get_header_symbol(file): ''' Returns the first character of the first line, which identifies the header of a read, usually a "@". helper function to remove_poly_Gs() param: str file = name of the read file return: str CHAR = first character of the header ''' with open(file, 'r') as infile: