def combine_metrics(args):
    """Combine the per-sample metrics files into a single table of metrics for all samples.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            samples
                sample_name_one/metrics
            metrics.tsv

    All the input files are created outside of this function. Before running
    this command, the metrics file for each sample must be created by the
    collect_metrics command.

    The package documentation provides an example of preparing these files
    based on the lambda_virus sequence that is used as one test for this
    package.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile : Path to file containing a list of directories -- one per sample
        metricsFileName : File name of the metrics files which must exist in each of the sample directories
        mergedMetricsFile : Path to the output merged metrics file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path], error_handler="global")

    metrics_file_name = args.metricsFileName
    merged_metrics_path = args.mergedMetricsFile

    with open(sample_directories_list_path, "r") as f:
        sample_directories = [line.rstrip() for line in f]
    sample_directories = [d for d in sample_directories if d]
    metrics_files = [os.path.join(d, metrics_file_name) for d in sample_directories]

    #==========================================================================
    # Check if merge has already been done
    #==========================================================================
    needs_rebuild = utils.target_needs_rebuild(metrics_files, merged_metrics_path)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# The merged metrics file is already freshly created. Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Parse the metrics files and print the tabular results
    #==========================================================================
    with open(merged_metrics_path, 'w') as f:
        # Emit the column headings
        column_headings = ["Sample", "Fastq Files", "Fastq File Size", "Machine", "Flowcell", "Number of Reads",
                           "Duplicate Reads", "Percent of Reads Mapped", "Percent Proper Pair", "Average Insert Size",
                           "Average Pileup Depth", "Phase1 SNPs", "Phase1 Preserved SNPs", "Phase2 SNPs",
                           "Phase2 Preserved SNPs", "Missing SNP Matrix Positions",
                           "Missing Preserved SNP Matrix Positions", "Excluded Sample", "Excluded Preserved Sample",
                           "Warnings and Errors"]
        if not args.spaceHeadings:
            column_headings = [heading.replace(' ', '_') for heading in column_headings]

        tabbed_headings = '\t'.join(column_headings)
        f.write(tabbed_headings + '\n')

        # Read the metrics from each sample and emit the values
        for metrics_file in metrics_files:
            verbose_print("Processing " + metrics_file)
            message = None
            if not os.path.isfile(metrics_file):
                message = "Sample metrics file %s does not exist." % metrics_file
            elif os.path.getsize(metrics_file) == 0:
                message = "Sample metrics file %s is empty." % metrics_file
            if message:
                f.write(message + '\n')
                utils.sample_warning(message)
                continue

            metrics = utils.read_properties(metrics_file)

            f.write(quoted(metrics.get("sample", "")) + '\t')
            f.write(quoted(metrics.get("fastqFileList", "")) + '\t')
            f.write(metrics.get("fastqFileSize", "") + '\t')
            f.write(metrics.get("machine", "") + '\t')
            f.write(metrics.get("flowcell", "") + '\t')
            f.write(metrics.get("numberReads", "") + '\t')
            f.write(metrics.get("numberDupReads", "") + '\t')
            f.write(metrics.get("percentReadsMapped", "") + '\t')
            f.write(metrics.get("percentProperPair", "") + '\t')
            f.write(metrics.get("aveInsertSize", "") + '\t')
            f.write(metrics.get("avePileupDepth", "") + '\t')
            f.write(metrics.get("phase1Snps", "") + '\t')
            f.write(metrics.get("phase1SnpsPreserved", "") + '\t')
            f.write(metrics.get("snps", "") + '\t')
            f.write(metrics.get("snpsPreserved", "") + '\t')
            f.write(metrics.get("missingPos", "") + '\t')
            f.write(metrics.get("missingPosPreserved", "") + '\t')
            f.write(metrics.get("excludedSample", "") + '\t')
            f.write(metrics.get("excludedSamplePreserved", "") + '\t')
            f.write(quoted(metrics.get("errorList", "")) + '\n')
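
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the pipeline's CLI wiring): shows how
# combine_metrics() could be driven directly with an argparse namespace.  The
# helper below and its file names are hypothetical placeholders; the attribute
# names mirror the Parameters section above plus the forceFlag and
# spaceHeadings flags referenced in the code.
# ----------------------------------------------------------------------------
def _example_combine_metrics():
    import argparse
    args = argparse.Namespace()
    args.sampleDirsFile = "sampleDirectories.txt"  # one sample directory per line
    args.metricsFileName = "metrics"               # per-sample file produced by collect_metrics
    args.mergedMetricsFile = "metrics.tsv"         # merged output table
    args.spaceHeadings = True                      # keep spaces in the column headings
    args.forceFlag = False                         # do not force a rebuild
    combine_metrics(args)
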
def calculate_snp_distances(args):
    """Calculate pairwise sample SNP distances.

    Calculate pairwise SNP distances from the multi-fasta SNP matrix.
    Generate a file of pairwise distances and a file containing a matrix
    of distances.

    This function expects, or creates '(*)', the following files:
            snpma.fasta
            snp_distance_pairwise.tsv*
            snp_distance_matrix.tsv*

    The files are used as follows:
        1. The snpma.fasta input file contains the snp matrix for all samples.
        2. The snp_distance_pairwise.tsv output file contains a three-column
           tab-separated table of distances between all pairs of samples.
        3. The snp_distance_matrix.tsv output file contains a matrix of
           distances between all samples.

    Parameters
    ----------
    args : Namespace
        inputFile: File path (not just file name) for the snp matrix in fasta format
        pairwiseFile: File path (not just file name) of the output pairwise distance file
        matrixFile: File path (not just file name) for the output distance matrix file

    Raises:

    Examples:
    args = argparse.Namespace()
    args.inputFile = 'snpma.fasta'
    args.pairwiseFile = 'snp_distance_pairwise.tsv'
    args.matrixFile = 'snp_distance_matrix.tsv'
    calculate_snp_distances(args)
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate arguments
    #==========================================================================
    input_file = args.inputFile
    pairwise_file = args.pairwiseFile
    matrix_file = args.matrixFile
    force_flag = args.forceFlag

    bad_file_count = utils.verify_existing_input_files("SNP matrix file", [input_file])
    if bad_file_count > 0:
        utils.global_error("Error: cannot calculate sequence distances without the snp matrix file.")

    if not pairwise_file and not matrix_file:
        utils.global_error("Error: no output file specified.")

    #==========================================================================
    # Check freshness
    #==========================================================================
    rebuild_pairwise_file = pairwise_file and utils.target_needs_rebuild([input_file], pairwise_file)
    rebuild_matrix_file = matrix_file and utils.target_needs_rebuild([input_file], matrix_file)
    if force_flag or rebuild_pairwise_file or rebuild_matrix_file:

        #------------------------------
        # Read in snp matrix file
        #------------------------------
        seqs = {}
        with open(input_file) as ifile:
            for line in ifile:
                line = line.rstrip('\n')
                if line.startswith('>'):
                    curr_sample = line.lstrip('>')
                    seqs[curr_sample] = ''
                else:
                    seqs[curr_sample] += str(line)

        #------------------------------
        # Count mismatches
        #------------------------------
        verbose_print("# %s %s" % (utils.timestamp(), "Calculating all pairwise distances"))
        ids = sorted(seqs.keys())
        pairwise_mismatches = dict()  # tuple (seq1 id, seq2 id) -> int

        for id1, id2 in itertools.combinations(ids, 2):
            mismatches = utils.calculate_sequence_distance(seqs[id1], seqs[id2])
            pairwise_mismatches[(id1, id2)] = mismatches
            pairwise_mismatches[(id2, id1)] = mismatches

        #------------------------------
        # Print distance files
        #------------------------------
        if pairwise_file:
            with open(pairwise_file, 'w') as p_out:
                p_out.write('%s\n' % '\t'.join(['Seq1', 'Seq2', 'Distance']))
                for id1, id2 in itertools.product(ids, ids):
                    mismatches = pairwise_mismatches.get((id1, id2), 0)  # zero when id1=id2
                    p_out.write("%s\t%s\t%i\n" % (id1, id2, mismatches))

        if matrix_file:
            with open(matrix_file, 'w') as m_out:
                m_out.write('\t%s\n' % '\t'.join(ids))  # matrix header
                # write table of mismatches
                for id1 in ids:
                    mismatches = [pairwise_mismatches.get((id1, id2), 0) for id2 in ids]
                    mismatch_strs = map(str, mismatches)
                    m_out.write("%s\t%s\n" % (id1, '\t'.join(mismatch_strs)))
    else:
        utils.verbose_print("Distance files have already been freshly built. Use the -f option to force a rebuild.")
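
# ----------------------------------------------------------------------------
# For reference, a minimal stand-in for the per-pair distance computed above,
# assuming utils.calculate_sequence_distance() counts the positions at which
# two equal-length SNP matrix sequences differ.  The real helper in utils may
# handle gaps and ambiguous bases differently; this sketch is illustrative only.
# ----------------------------------------------------------------------------
def _example_mismatch_count(seq1, seq2):
    """Count mismatching positions between two equal-length sequences."""
    return sum(1 for base1, base2 in zip(seq1, seq2) if base1 != base2)
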
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is
    generated, and snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get("SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher
        samtools_version = version_str.split()[-1]  # just the number
        if samtools_version < "1.3":
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file], deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error("Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable.")
            else:
                version_str = utils.extract_version_str("Picard", "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1")
                picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file, "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
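
# ----------------------------------------------------------------------------
# Roughly equivalent shell pipeline for one sample, with the default parameters
# shown above (the function substitutes the *_ExtraParams environment variables
# and skips any step whose output is already fresh):
#
#   samtools view -S -b -F 4 -o reads.unsorted.bam reads.sam
#   samtools sort -o reads.sorted.bam reads.unsorted.bam
#   java picard.cmdline.PicardCommandLine MarkDuplicates \
#       INPUT=reads.sorted.bam OUTPUT=reads.sorted.deduped.bam \
#       METRICS_FILE=duplicate_reads_metrics.txt
#   samtools mpileup -f referenceFile.fasta reads.sorted.deduped.bam > reads.all.pileup
#   java net.sf.varscan.VarScan mpileup2snp reads.all.pileup --output-vcf 1 > var.flt.vcf
# ----------------------------------------------------------------------------
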
def create_snp_reference_seq(options_dict):
    """Write reference sequence bases at SNP locations to a fasta file.

    Description:
    Write reference sequence bases at SNP locations to a fasta file.

    This function expects, or creates '(*)', the following files:
            reference.fasta
            snplist.txt
            referenceSNP.fasta (*)

    The files are used as follows:
        1. The reference.fasta input file contains the whole-genome reference bases.
        2. The snplist.txt input file contains the list of SNP positions across
           all the samples.
        3. The referenceSNP.fasta output file contains the reference bases at
           the identified SNP locations.

    The snplist.txt file is created outside of this function. The package
    documentation provides an example of creating this file based on the
    lambda_virus sequence that is used as one test for this package.

    Args:
        referenceFile: File path (not just file name) for reference sequence (in fasta format)
        snpListFile: File path (not just file name) of text format list of SNP positions
        snpRefFile: File path (not just file name) for the SNP reference sequence file.

    Raises:

    Examples:
    options_dict = {'referenceFile':'reference.fasta',
                    'snpListFile':'snplist.txt',
                    'snpRefFile':'referenceSNP.fasta'
                   }
    create_snp_reference_seq(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Write reference sequence bases at SNP locations to a fasta file.
    #==========================================================================
    reference_file = options_dict['referenceFile']
    snp_list_file_path = options_dict['snpListFile']
    snp_ref_seq_path = options_dict['snpRefFile']

    #==========================================================================
    # Verify input files exist
    #==========================================================================
    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot create the snp reference sequence without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Reference file", [reference_file])
    if bad_file_count > 0:
        utils.global_error("Error: cannot create the snp reference sequence without the reference fasta file.")

    #==========================================================================
    # Find the reference bases at the snp positions
    #==========================================================================
    source_files = [reference_file, snp_list_file_path]
    if options_dict['forceFlag'] or utils.target_needs_rebuild(source_files, snp_ref_seq_path):
        utils.write_reference_snp_file(reference_file, snp_list_file_path, snp_ref_seq_path)
        verbose_print("")
    else:
        verbose_print("SNP reference sequence %s has already been freshly built. Use the -f option to force a rebuild." % snp_ref_seq_path)

    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
def create_snp_list(options_dict):
    """Create SNP list file

    Description:
    Create the SNP list -- the list of positions where variants were found
    and the corresponding list of samples having a variant at each position.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            sampleDirectories.txt
            samples
                sample_name_one/var.flt.vcf
                ...
            snplist.txt (*)

    The files are used as follows:
        1. The sampleDirectories.txt input file contains a list of the paths to
           the sample directories.
        2. The var.flt.vcf variant input files are used to construct the
           SNP position list.
        3. The snplist.txt output file contains the union of the SNP positions
           and sample names extracted from all the var.flt.vcf files.

    The sampleDirectories.txt and var.flt.vcf files are created outside of this
    function. The package documentation provides an example of creating these
    files based on the lambda_virus sequence that is used as one test for this
    package.

    Args:
        sampleDirsFile: File path (not just file name) of file containing paths
            to directories containing var.flt.vcf file for each sequence.
        vcfFileName: File name of the VCF files which must exist in each of the
            sample directories
        snpListFile: File path (not just file name) of text format list of SNP positions

    Raises:

    Examples:
    options_dict = {'sampleDirsFile':'sampleDirectories.txt',
                    'vcfFileName':'var.flt.vcf',
                    'snpListFile':'snplist.txt',
                   }
    create_snp_list(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    #==========================================================================
    # Prep work
    #==========================================================================
    sample_directories_list_path = options_dict['sampleDirsFile']
    bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path])
    if bad_file_count > 0:
        utils.global_error(None)

    with open(sample_directories_list_path, "r") as sample_directories_list_file:
        unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file]
    unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d]
    sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories)

    #==========================================================================
    # Validate inputs
    #==========================================================================
    snp_list_file_path = options_dict['snpListFile']
    vcf_file_name = options_dict['vcfFileName']
    list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories]

    bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files)
    if bad_file_count == len(list_of_vcf_files):
        utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count)
    elif bad_file_count > 0:
        utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True)

    #==========================================================================
    # Read in all vcf files and process into dict of SNPs passing various
    # criteria. Do this for each sample. Write to file.
    #==========================================================================
    if options_dict['forceFlag'] or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path):
        snp_dict = dict()
        excluded_sample_directories = set()
        for sample_dir, vcf_file_path in zip(sorted_list_of_sample_directories, list_of_vcf_files):

            if not os.path.isfile(vcf_file_path):
                continue
            if os.path.getsize(vcf_file_path) == 0:
                continue

            verbose_print("Processing VCF file %s" % vcf_file_path)
            sample_name = os.path.basename(os.path.dirname(vcf_file_path))
            snp_set = utils.convert_vcf_file_to_snp_set(vcf_file_path)
            max_snps = options_dict['maxSnps']
            if max_snps >= 0 and len(snp_set) > max_snps:
                verbose_print("Excluding sample %s having %d snps." % (sample_name, len(snp_set)))
                excluded_sample_directories.add(sample_dir)
                continue

            for key in snp_set:
                if key not in snp_dict:
                    sample_list = [sample_name]
                    snp_dict[key] = sample_list
                else:
                    sample_list = snp_dict[key]
                    sample_list.append(sample_name)

        verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files)))
        utils.write_list_of_snps(snp_list_file_path, snp_dict)
        verbose_print("")

        #==========================================================================
        # Write the filtered list of sample directories
        #==========================================================================
        sample_directories_list_path = sample_directories_list_path + ".filtered"
        with open(sample_directories_list_path, "w") as filtered_samples_file_object:
            # Loop over the unsorted list to keep the order of samples the same as the original.
            # This will keep the same HPC log file suffix number.
            for sample_dir in unsorted_list_of_sample_directories:
                if sample_dir not in excluded_sample_directories:
                    filtered_samples_file_object.write("%s\n" % sample_dir)
    else:
        verbose_print("SNP list %s has already been freshly built. Use the -f option to force a rebuild." % snp_list_file_path)

    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
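
# ----------------------------------------------------------------------------
# Shape of the intermediate snp_dict built above, with illustrative values only:
# each key identifies one variant position taken from the sample VCF files
# (assumed here to be a (contig, position) tuple as produced by
# utils.convert_vcf_file_to_snp_set), and the value is the list of sample names
# having a variant at that position.
#
#   snp_dict = {
#       ("chrom_one", 12345): ["sample_name_one", "sample_name_two"],
#       ("chrom_one", 67890): ["sample_name_two"],
#   }
# ----------------------------------------------------------------------------
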
def index_ref(args):
    """Index the reference genome.

    Execute an external program (bowtie2 or smalt) to create an index for the
    reference genome to be used during subsequent alignment.  Execute samtools
    to create the faidx index file to be used during subsequent pileups.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta         # input fasta
                referenceFile.#.bt2*        # bowtie2 output
                referenceFile.rev.#.bt2*    # bowtie2 output
                referenceFile.sma*          # smalt output
                referenceFile.smi*          # smalt output
                referenceFile.fasta.fai*    # samtools faidx output

    The input fasta file is created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    reference_base_path = os.path.splitext(reference_file_path)[0]  # strip the file extension

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error("Error: only bowtie2 and smalt aligners are supported.")

    # Create index file for reference
    if snp_pipeline_aligner == "bowtie2":
        target_file = reference_base_path + ".rev.1.bt2"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path], target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Bowtie index %s is already freshly built. Use the -f option to force a rebuild." % target_file)
        else:
            version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")
            bowtie2_build_extra_params = os.environ.get("Bowtie2Build_ExtraParams") or ""
            command_line = "bowtie2-build " + bowtie2_build_extra_params + ' ' + reference_file_path + ' ' + reference_base_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
            utils.global_error_on_missing_file(target_file, "bowtie2-build")

    elif snp_pipeline_aligner == "smalt":
        target_file = reference_base_path + ".smi"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path], target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Smalt index %s is already freshly built. Use the -f option to force a rebuild." % target_file)
        else:
            version_str = utils.extract_version_str("smalt", "smalt version")
            smalt_index_extra_params = os.environ.get("SmaltIndex_ExtraParams") or ""
            command_line = "smalt index " + smalt_index_extra_params + ' ' + reference_base_path + ' ' + reference_file_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)

    # Create the samtools fai index
    verbose_print("")
    target_file = reference_file_path + ".fai"
    needs_rebuild = utils.target_needs_rebuild([reference_file_path], target_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# SAMtools fai index %s is already freshly built. Use the -f option to force a rebuild." % target_file)
    else:
        version_str = utils.extract_version_str("samtools", "samtools 2>&1 > /dev/null")
        samtools_faidx_extra_params = os.environ.get("SamtoolsFaidx_ExtraParams") or ""
        command_line = "samtools faidx " + samtools_faidx_extra_params + ' ' + reference_file_path
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.global_error_on_missing_file(target_file, "samtools faidx")

    # Create the reference dict file used later by GATK
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"
    if enable_local_realignment:
        verbose_print("")
        target_file = reference_base_path + ".dict"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path], target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Sequence dictionary %s is already freshly built. Use the -f option to force a rebuild." % target_file)
        else:
            utils.remove_file(target_file)  # Need to delete existing output, if any, before running
            jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
            if not jar_file_path:
                utils.global_error("Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable.")
            version_str = utils.extract_version_str("Picard", "java -jar " + jar_file_path + " CreateSequenceDictionary --version 2>&1")
            picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
            picard_create_sequence_dictionary_extra_params = os.environ.get("CreateSequenceDictionary_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
            command_line = "java " + picard_jvm_extra_params + " -jar " + jar_file_path + " CreateSequenceDictionary REFERENCE=" + reference_file_path + " OUTPUT=" + target_file + tmp_option + ' ' + picard_create_sequence_dictionary_extra_params
            verbose_print("# Create reference sequence dictionary.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(target_file, "picard CreateSequenceDictionary")
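
# ----------------------------------------------------------------------------
# Roughly equivalent shell commands for the default bowtie2 path, with empty
# *_ExtraParams (the function picks bowtie2 or smalt from SnpPipeline_Aligner
# and skips any step whose output is already fresh); picard.jar stands for the
# jar located via the CLASSPATH environment variable:
#
#   bowtie2-build referenceFile.fasta referenceFile
#   samtools faidx referenceFile.fasta
#   java -jar picard.jar CreateSequenceDictionary \
#       REFERENCE=referenceFile.fasta OUTPUT=referenceFile.dict
# ----------------------------------------------------------------------------
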
def call_consensus(options_dict):
    """Call the consensus base for a sample

    Call the consensus base for a sample at the positions where SNPs were found
    in any of the samples.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/consensus.fasta (*)

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from all the var.flt.vcf files combined.
        2. The reads.all.pileup input file is a pileup at all positions used
           to determine the nucleotide base at each SNP position.
        3. The consensus.fasta output file contains the SNP calls for each
           sequence, arranged as a fasta file with one sequence per sample.

    The snplist.txt and reads.all.pileup files are created outside of this
    function. The package documentation provides an example of creating these
    files based on the lambda_virus sequence that is used as one test for this
    package.

    Args:
        forceFlag : boolean
            flag to force processing even when result file already exists and
            is newer than inputs
        snpListFile : str
            File path (not just file name) of text format list of SNP positions
        allPileupFile : str
            Relative or absolute path to the genome-wide pileup file for this
            sample
        consensusFile : str
            Output file. Relative or absolute path to the consensus fasta file
            for this sample.
        minBaseQual : int
            Minimum base quality score to count a read. All other snp filters
            take effect after the low-quality reads are discarded.
        minConsFreq : float
            Consensus frequency. Minimum fraction of high-quality reads
            supporting the consensus to make a call.
        minConsStrdDpth : int
            Consensus strand depth. Minimum number of high-quality reads
            supporting the consensus which must be present on both the
            forward and reverse strands to make a call.
        minConsStrdBias : float
            Strand bias. Minimum fraction of the high-quality
            consensus-supporting reads which must be present on both the
            forward and reverse strands to make a call. The numerator of this
            fraction is the number of high-quality consensus-supporting reads
            on one strand.  The denominator is the total number of high-quality
            consensus-supporting reads on both strands combined.

    Raises:

    Examples:
    options_dict = {'snpListFile':'snplist.txt',
                    'allPileupFile':'reads.all.pileup',
                    'consensusFile':'consensus.fasta',
                    'minBaseQual':15,
                    'minConsFreq':0.6,
                    'minConsStrdDpth':4,
                    'minConsStrdBias':0.10,
                    'vcfFailedSnpGt':'.'
                   }
    call_consensus(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    snp_list_file_path = options_dict['snpListFile']
    all_pileup_file_path = options_dict['allPileupFile']
    sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path))
    sample_name = os.path.basename(sample_directory)
    consensus_file_path = options_dict['consensusFile']
    consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path))
    vcf_file_name = options_dict['vcfFileName']
    vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None

    bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path])
    if bad_file_count > 0:
        utils.global_error("Error: cannot call consensus without the snplist file.")

    bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path])
    if bad_file_count > 0:
        utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False)

    # Check if the result is already fresh
    source_files = [snp_list_file_path, all_pileup_file_path]
    if not options_dict['forceFlag'] and not utils.target_needs_rebuild(source_files, consensus_file_path):
        verbose_print("Consensus call file %s has already been freshly built. Use the -f option to force a rebuild." % consensus_file_path)
        verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
        return

    # Load the list of positions to call
    snp_list = utils.read_snp_position_list(snp_list_file_path)
    snplist_length = len(snp_list)
    verbose_print("snp position list length = %d" % snplist_length)

    # Call consensus. Write results to file.
    position_consensus_base_dict = dict()

    caller = pileup.ConsensusCaller(options_dict['minConsFreq'],
                                    options_dict['minConsStrdDpth'],
                                    options_dict['minConsStrdBias'])

    snp_positions = set(snp_list)
    parse_positions = None if options_dict['vcfAllPos'] else snp_positions
    pileup_reader = pileup.Reader(all_pileup_file_path,
                                  options_dict['minBaseQual'],
                                  parse_positions)
    if vcf_file_name:
        writer = vcf_writer.SingleSampleWriter(vcf_file_path, options_dict['vcfPreserveRefCase'])
        filters = caller.get_filter_descriptions()
        writer.write_header(sample_name, filters, options_dict['vcfRefName'])
    for pileup_record in pileup_reader:
        chrom = pileup_record.chrom
        pos = pileup_record.position
        consensus_base, fail_reasons = caller.call_consensus(pileup_record)
        if (chrom, pos) in snp_positions:
            if fail_reasons:
                position_consensus_base_dict[(chrom, pos)] = '-'
            else:
                position_consensus_base_dict[(chrom, pos)] = consensus_base

        if vcf_file_name:
            writer.write_from_pileup(pileup_record, fail_reasons, options_dict['vcfFailedSnpGt'])
    if vcf_file_name:
        writer.close()

    verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict)))

    consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list]
    consensus_str = ''.join(consensus_list)
    snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="")

    # Write the consensus calls to a fasta file
    with open(consensus_file_path, "w") as fasta_file_object:
        SeqIO.write([snp_seq_record], fasta_file_object, "fasta")

    verbose_print("")
    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
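
# ----------------------------------------------------------------------------
# Worked example of the minConsStrdBias filter described in the docstring
# above: with 18 high-quality consensus-supporting reads on the forward strand
# and 2 on the reverse strand, the smaller per-strand fraction is
# 2 / (18 + 2) = 0.10, which meets the example minConsStrdBias of 0.10.
# Whether the comparison is strict or inclusive is determined inside
# pileup.ConsensusCaller, not here.
# ----------------------------------------------------------------------------
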
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is
    generated, and snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    input_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    input_bam_file = utils.add_file_suffix(input_bam_file, ".deduped", enable=remove_duplicate_reads)
    input_bam_file = utils.add_file_suffix(input_bam_file, ".indelrealigned", enable=enable_local_realignment)

    utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([input_bam_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + input_bam_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
        if not jar_file_path:
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java -jar " + jar_file_path + " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " -jar " + jar_file_path + " mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
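
# ----------------------------------------------------------------------------
# Roughly equivalent shell commands for one sample under the default settings
# (deduplication and local realignment both enabled, empty *_ExtraParams);
# VarScan.jar stands for the jar located via the CLASSPATH environment variable:
#
#   samtools mpileup -f referenceFile.fasta \
#       reads.sorted.deduped.indelrealigned.bam > reads.all.pileup
#   java -jar VarScan.jar mpileup2snp reads.all.pileup --output-vcf 1 > var.flt.vcf
# ----------------------------------------------------------------------------
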
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file. Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum([os.path.getsize(file) for file in fastq_files])

    # Make a comma separated list of just the fastq file names without directories
    fastq_file_list = [os.path.basename(file) for file in fastq_files]
    fastq_file_list = ", ".join(fastq_file_list)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads, %mapped, %proper pair, and ave insert size from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    percent_proper_pair = ""
    ave_insert_size = ""
    file = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads", "")  # reuse already fresh metrics
            percent_reads_mapped = metrics.get("percentReadsMapped", "")  # reuse already fresh metrics
            percent_proper_pair = metrics.get("percentProperPair", "")  # reuse already fresh metrics
            ave_insert_size = metrics.get("aveInsertSize", "")  # reuse already fresh metrics
        missing_any_metrics = not all([num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size])
        if not missing_any_metrics:
            verbose_print("Reusing previously calculated number of reads, %mapped, %proper pair, and ave insert size")
        else:
            tempfile_path = os.path.join(sample_dir, "tmp.sam.stats")
            try:
                command.run("samtools stats " + file, tempfile_path)
            except subprocess.CalledProcessError:
                pass  # the error message has already been printed to stderr
            with open(tempfile_path) as f:
                for line in f:
                    lower_line = line.lower()
                    split_line = line.strip().split('\t')
                    if "raw total sequences:" in lower_line:
                        num_reads = split_line[2]
                        continue
                    if "reads mapped:" in lower_line:
                        reads_mapped = split_line[2]
                        try:
                            percent_reads_mapped = 100.0 * float(reads_mapped) / float(num_reads)
                            percent_reads_mapped = "%.2f" % percent_reads_mapped
                        except ValueError:
                            percent_reads_mapped = ""
                        continue
                    if "reads properly paired:" in lower_line:
                        proper_pairs = split_line[2]
                        try:
                            percent_proper_pair = 100.0 * float(proper_pairs) / float(num_reads)
                            percent_proper_pair = "%.2f" % percent_proper_pair
                        except ValueError:
                            percent_proper_pair = ""
                        continue
                    if "insert size average:" in lower_line:
                        ave_insert_size = split_line[2]
                        continue
            os.unlink(tempfile_path)
            missing_any_metrics = not all([num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size])
            if missing_any_metrics:
                missing_list = []
                if not num_reads:
                    missing_list.append("number of reads")
                if not percent_reads_mapped:
                    missing_list.append("percent reads mapped")
                if not percent_proper_pair:
                    missing_list.append("percent proper pair")
                if not ave_insert_size:
                    missing_list.append("ave insert size")
                error_text = "Cannot calculate " + ", ".join(missing_list) + '.'
                handle_error(error_text)

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", file):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads", "")  # reuse already fresh metrics
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + file)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    file = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth", "")  # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(file) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                #print("depth_sum=%i" % depth_sum)
                #print("reference_length=%i" % reference_length)
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    file = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps", "")  # reuse already fresh metrics
        if phase1_snps:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            phase1_snps = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps > max_allowed_snps:
                excluded_sample = "Excluded"
                handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
            phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    file = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "")  # reuse already fresh metrics
        if phase1_snps_preserved:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            phase1_snps_preserved = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps:
                excluded_sample_preserved = "Excluded"
                handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
% max_allowed_snps) phase1_snps_preserved = str(phase1_snps_preserved) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file")) #------------------------- phase2_snps = "" file = os.path.join(sample_dir, consensus_vcf_file_name) if verify_input_file("Consensus VCF file", file): # Omit the phase2 snp count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase2_snps = metrics.get("snps", "") # reuse already fresh metrics if phase2_snps: verbose_print("Reusing previously calculated phase2 snps") else: phase2_snps = count_vcf_file_snps(file) phase2_snps = str(phase2_snps) #------------------------- verbose_print( "# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file")) #------------------------- phase2_snps_preserved = "" file = os.path.join(sample_dir, consensus_preserved_vcf_file_name) if verify_input_file("Consensus VCF file", file): # Omit the phase2 snp count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample_preserved != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase2_snps_preserved = metrics.get( "snpsPreserved", "") # reuse already fresh metrics if phase2_snps_preserved: verbose_print( "Reusing previously calculated preserved phase2 snps") else: phase2_snps_preserved = count_vcf_file_snps(file) phase2_snps_preserved = str(phase2_snps_preserved) #------------------------------------------ verbose_print( "# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix")) #------------------------------------------ missing_pos = "" file = os.path.join(sample_dir, consensus_fasta_file_name) if verify_input_file("Consensus fasta file", file): # Omit the phase2 gap count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: missing_pos = metrics.get("missingPos", "") # reuse already fresh metrics if missing_pos: verbose_print( "Reusing previously calculated missing positions") else: missing_pos = count_missing_snp_matrix_positions( file, sample_id) missing_pos = str(missing_pos) #------------------------------------------ verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix")) #------------------------------------------ missing_pos_preserved = "" file = os.path.join(sample_dir, consensus_preserved_fasta_file_name) if verify_input_file("Consensus fasta file", file): # Omit the phase2 gap count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample_preserved != "Excluded": # Metrics already freshly collected? 
needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: missing_pos_preserved = metrics.get( "missingPosPreserved", "") # reuse already fresh metrics if missing_pos_preserved: verbose_print( "Reusing previously calculated missing positions") else: missing_pos_preserved = count_missing_snp_matrix_positions( file, sample_id) missing_pos_preserved = str(missing_pos_preserved) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Print results")) #------------------------- with open(metrics_file_path, "w") as f: print("sample=" + '"' + sample_id + '"', file=f) print("fastqFileList=" + '"' + fastq_file_list + '"', file=f) print("fastqFileSize=" + str(fastq_file_size), file=f) print("machine=" + machine, file=f) print("flowcell=" + flowcell, file=f) print("numberReads=" + num_reads, file=f) print("numberDupReads=" + num_dup_reads, file=f) print("percentReadsMapped=" + percent_reads_mapped, file=f) print("percentProperPair=" + percent_proper_pair, file=f) print("aveInsertSize=" + ave_insert_size, file=f) print("avePileupDepth=" + ave_pileup_depth, file=f) print("phase1Snps=" + phase1_snps, file=f) print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f) print("snps=" + phase2_snps, file=f) print("snpsPreserved=" + phase2_snps_preserved, file=f) print("missingPos=" + missing_pos, file=f) print("missingPosPreserved=" + missing_pos_preserved, file=f) print("excludedSample=" + excluded_sample, file=f) print("excludedSamplePreserved=" + excluded_sample_preserved, file=f) print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
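# --------------------------------------------------------------------------
# Illustrative sketch: the metrics file written above is a simple key=value
# properties file, with double quotes around values that may contain spaces
# or commas.  The pipeline reads it back with utils.read_properties(); the
# helper below is only an assumed, minimal reader for that layout, shown so
# the file format is clear.  The function name is hypothetical.
# --------------------------------------------------------------------------
def example_read_metrics_properties(metrics_file_path):
    """Parse a key=value metrics file into a dict, stripping surrounding quotes."""
    metrics = dict()
    with open(metrics_file_path) as f:
        for line in f:
            line = line.strip()
            if not line or '=' not in line:
                continue
            key, _, value = line.partition('=')
            metrics[key.strip()] = value.strip().strip('"')
    return metrics

# Example usage (assumes the metrics file already exists in the sample directory):
#     metrics = example_read_metrics_properties("samples/sample_name_one/metrics")
#     metrics.get("numberReads", "")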
def filter_regions_per_sample(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path): """Detect abnormal regions in each sample and filter those regions from all samples. Parameters ---------- list_of_vcf_files : list of str List of input VCF file paths -- one per sample. contig_length_dict : dict, str --> int Mapping of contig id to int length of contig. sorted_list_of_outgroup_samples : list of str List of sample IDs for samples that are outgroup samples. force_flag : bool Force processing even when result files already exist and are newer than inputs. edge_length : int The length of the edge regions in a contig, in which all SNPs will be removed. window_size_list : list of int The length of the window in which the number of SNPs should be no more than max_num_snp. max_num_snps_list : list of int The maximum number of SNPs allowed in a window. This list has the same size as window_size_list and the entries correspond to one another. ref_fasta_path : str Path to the reference fasta file. out_group_list_path : str Path to the file indicating outgroup samples, one sample ID per line. """ #========================================================================== # Prep work #========================================================================== input_file_list = list() input_file_list.append(ref_fasta_path) if out_group_list_path: input_file_list.append(out_group_list_path) #========================================================================== # Which samples need rebuild? # # Any changed or new input file will trigger rebuild only for that sample. # A missing output file will only cause rebuild of the missing file. #========================================================================== need_rebuild_dict = dict() for vcf_file_path in list_of_vcf_files: preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" input_files = input_file_list + [vcf_file_path] preserved_needs_rebuild = utils.target_needs_rebuild( input_files, preserved_vcf_file_path) removed_needs_rebuild = utils.target_needs_rebuild( input_files, removed_vcf_file_path) need_rebuild_dict[ vcf_file_path] = force_flag or preserved_needs_rebuild or removed_needs_rebuild if not any(need_rebuild_dict.values()): utils.verbose_print( "All preserved and removed vcf files are already freshly built. Use the -f option to force a rebuild." ) return #========================================================================== # Find all bad regions in one sample at a time #========================================================================== for vcf_file_path in list_of_vcf_files: if not need_rebuild_dict[vcf_file_path]: continue try: vcf_reader_handle = open(vcf_file_path, 'r') vcf_reader = vcf.Reader(vcf_reader_handle) except: utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True) continue sample_ID = utils.sample_id_from_file(vcf_file_path) utils.verbose_print("Processing sample %s" % sample_ID) if sample_ID in sorted_list_of_outgroup_samples: write_outgroup_preserved_and_removed_vcf_files( vcf_file_path, vcf_reader) else: # The bad_regions_dict holds the bad regions for this sample # Key is the contig ID, and the value is a list of bad region tuples (start_position, end_position). 
            bad_regions_dict = dict()
            collect_dense_regions(vcf_reader, bad_regions_dict, contig_length_dict, edge_length, max_num_snps_list, window_size_list)

            # Combine all bad regions for each contig
            for contig, regions in bad_regions_dict.items():
                combined_regions = utils.merge_regions(regions)
                bad_regions_dict[contig] = combined_regions

            # Write the output files
            write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict)

        vcf_reader_handle.close()
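# --------------------------------------------------------------------------
# Illustrative sketch: combining the per-contig bad regions above relies on
# merging overlapping (start, end) tuples, which the pipeline delegates to
# utils.merge_regions().  The helper below is an assumed, minimal equivalent
# of that interval-merging step; the name and the exact semantics (touching
# intervals are collapsed) are assumptions, not the pipeline's own code.
# --------------------------------------------------------------------------
def example_merge_regions(regions):
    """Merge overlapping or touching (start, end) intervals into a sorted list."""
    merged = []
    for start, end in sorted(regions):
        if merged and start <= merged[-1][1]:
            # Overlaps or touches the previous interval -- extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# Example usage:
#     example_merge_regions([(0, 500), (450, 900), (2000, 2100)])
#     -> [(0, 900), (2000, 2100)]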
def create_snp_reference_seq(args): """Write reference sequence bases at SNP locations to a fasta file. Write reference sequence bases at SNP locations to a fasta file. This function expects, or creates '(*)', the following files: reference.fasta snplist.txt referenceSNP.fasta (*) The files are used as follows: 1. The reference.fasta input file contains the whole-genome reference bases. 2. The snplist.txt input file contains the list of SNP positions across all the samples. 2. The referenceSNP.fasta output file contains the reference bases at the identified SNP locations. The snplist.txt file is created outside of this function. The package documentation provides an example of creating this file based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : Namespace referenceFile: File path (not just file name) for reference sequence in fasta format snpListFile: File path (not just file name) of text format list of SNP positions snpRefFile: File path (not just file name) for the SNP reference sequence file. Raises: Examples: args = argparse.Namespace args.referenceFile = 'reference.fasta' args.snpListFile = 'snplist.txt' args.snpRefFile = 'referenceSNP.fasta' create_snp_reference_seq(args) """ utils.print_log_header() utils.print_arguments(args) #========================================================================== # Write reference sequence bases at SNP locations to a fasta file. #========================================================================== reference_file = args.referenceFile snp_list_file_path = args.snpListFile snp_ref_seq_path = args.snpRefFile #========================================================================== # Verify input files exist #========================================================================== bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path]) if bad_file_count > 0: utils.global_error( "Error: cannot create the snp reference sequence without the snplist file." ) bad_file_count = utils.verify_non_empty_input_files( "Reference file", [reference_file]) if bad_file_count > 0: utils.global_error( "Error: cannot create the snp reference sequence without the reference fasta file." ) #========================================================================== # Find the reference bases at the snp positions #========================================================================== source_files = [reference_file, snp_list_file_path] if args.forceFlag or utils.target_needs_rebuild(source_files, snp_ref_seq_path): utils.write_reference_snp_file(reference_file, snp_list_file_path, snp_ref_seq_path) else: verbose_print( "SNP reference sequence %s has already been freshly built. Use the -f option to force a rebuild." % snp_ref_seq_path)
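# --------------------------------------------------------------------------
# Illustrative sketch: conceptually, building the SNP reference sequence means
# looking up the reference base at every snplist position and writing those
# bases as a single fasta record.  The pipeline does this inside
# utils.write_reference_snp_file(); the helper below is an assumed equivalent
# that expects a whitespace-delimited snplist whose first two columns are the
# contig name and the 1-based position.  The function and record id are
# illustrative only.
# --------------------------------------------------------------------------
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def example_write_reference_snp_fasta(reference_fasta, snp_list_path, out_fasta):
    """Write the reference base at each snplist position to a fasta file."""
    contigs = {rec.id: str(rec.seq) for rec in SeqIO.parse(reference_fasta, "fasta")}
    bases = []
    with open(snp_list_path) as f:
        for line in f:
            tokens = line.split()
            if len(tokens) < 2:
                continue
            contig, pos = tokens[0], int(tokens[1])
            bases.append(contigs[contig][pos - 1])  # assumes 1-based positions
    record = SeqRecord(Seq(''.join(bases)), id="referenceSNP", description="")
    with open(out_fasta, "w") as f:
        SeqIO.write([record], f, "fasta")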
def filter_regions(args): """Remove bad SNPs from original vcf files Remove bad SNPs -- this function finds bad regions, including the edges and probable prophage regions; then remove SNPs in these regions in original vcf files of all samples. This function expects, or creates '(*)', the following files arranged in the following way: sampleDirectories.txt samples sample_name_one/var.flt.vcf sample_name_one/var.flt_removed.vcf (*) sample_name_one/var.flt_preserved.vcf (*) ... The files are used as follows: 1. The sampleDirectories.txt input file contains a list of the paths to the sample directories. 2. The var.flt.vcf variant input files (i.e., the original vcf file). 3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and preserved SNPs. The sampleDirectories.txt and var.flt.vcf files are created outside of this function. The package documentation provides an example of creating these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- Args: sampleDirsFile: File path (not just file name) of file containing paths to directories containing var.flt.vcf file for each sequence. vcfFileName: File name of the VCF files which must exist in each of the sample directories refFastaFile: File path (not just file name) of reference fasta file edgeLength: the length of edge of a contig in which SNPs will be removed. Default is 500. windowSize: the size of the window in which max number of SNPs are allowed. Default is 1000. maxSNP: the maximum number of SNPs allowed in a window of a size defined in windowSize. Default is 3. Raises: Examples: args = argparse.Namespace args.sampleDirsFile = 'sampleDirectories.txt' args.vcfFileName = 'var.flt.vcf' args.refFastaFile = 'snplist.txt' remove_bad_snp(args) """ utils.print_log_header() utils.print_arguments(args) #========================================================================== # Validate some parameters #========================================================================== edge_length = args.edgeLength window_size = args.windowSize max_num_snp = args.maxSNP #========================================================================== # Prep work #========================================================================== sample_directories_list_path = args.sampleDirsFile bad_file_count = utils.verify_non_empty_input_files( "File of sample directories", [sample_directories_list_path]) if bad_file_count > 0: utils.global_error(None) with open(sample_directories_list_path, "r") as sample_directories_list_file: unsorted_list_of_sample_directories = [ line.rstrip() for line in sample_directories_list_file ] unsorted_list_of_sample_directories = [ d for d in unsorted_list_of_sample_directories if d ] sorted_list_of_sample_directories = sorted( unsorted_list_of_sample_directories) input_file_list = list() out_group_list_path = args.outGroupFile sorted_list_of_outgroup_samples = list() if out_group_list_path is not None: bad_file_count = utils.verify_non_empty_input_files( "File of outgroup samples", [out_group_list_path]) if bad_file_count > 0: utils.global_error(None) try: #There are outgroup samples input_file_list.append(out_group_list_path) with open(out_group_list_path, "r") as out_group_list_file: unsorted_list_of_outgroup_samples = [ line.rstrip() for line in out_group_list_file ] sorted_list_of_outgroup_samples = sorted( unsorted_list_of_outgroup_samples) except: utils.global_error( "Error: Cannot open the file containing the list of outgroup samples!" 
) #========================================================================== # Validate inputs #========================================================================== vcf_file_name = args.vcfFileName list_of_vcf_files = [ os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories ] input_file_list.extend(list_of_vcf_files) bad_file_count = utils.verify_non_empty_input_files( "VCF file", list_of_vcf_files) if bad_file_count == len(list_of_vcf_files): utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count) elif bad_file_count > 0: utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True) bad_file_count = utils.verify_non_empty_input_files( "Reference file", [args.refFastaFile]) if bad_file_count > 0: utils.global_error(None) #========================================================================== # Get contigs' length from the reference fasta file #========================================================================== try: handle = open(args.refFastaFile, "r") contig_length_dict = dict() for record in SeqIO.parse(handle, "fasta"): #build contig_length_dict contig_length_dict[record.id] = len(record.seq) input_file_list.append(args.refFastaFile) except: utils.global_error( "Error: cannot open the reference fastq file, or fail to read the contigs in the reference fastq file." ) else: if handle: handle.close() #========================================================================== # Which samples need rebuild? # # Any changed or new input file will trigger rebuild for all samples because # the bad regions are combined across all samples. However, a missing # output file will only cause rebuild of the missing file. #========================================================================== need_rebuild_dict = dict() for vcf_file_path in list_of_vcf_files: preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" preserved_needs_rebuild = utils.target_needs_rebuild( input_file_list, preserved_vcf_file_path) removed_needs_rebuild = utils.target_needs_rebuild( input_file_list, removed_vcf_file_path) need_rebuild_dict[ vcf_file_path] = args.forceFlag or preserved_needs_rebuild or removed_needs_rebuild if not any(need_rebuild_dict.values()): utils.verbose_print( "All preserved and removed vcf files are already freshly built. Use the -f option to force a rebuild." ) return #========================================================================== # Find all bad regions. #========================================================================== bad_regions_dict = dict( ) # Key is the contig ID, and the value is a list of bad regions. for vcf_file_path in list_of_vcf_files: try: vcf_reader_handle = open(vcf_file_path, 'r') vcf_reader = vcf.Reader(vcf_reader_handle) except: utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True) continue #Get sample ID ss = vcf_file_path.split('/') sample_ID = ss[-2] if sample_ID in sorted_list_of_outgroup_samples: if not need_rebuild_dict[vcf_file_path]: vcf_reader_handle.close() continue #Copy original vcf file to _preserved.vcf, and created an empty _removed.vcf #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF. 
preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" try: vcf_writer_removed = None vcf_writer_removed = vcf.Writer( open(removed_vcf_file_path, 'w'), vcf_reader) except: #print "Cannot create the file for removed SNPs: %d." % removed_vcf_file_path #close vcf_writer_reserved and remove the file reserved_vcf_file_path if vcf_writer_removed is not None: vcf_writer_removed.close() os.remove(removed_vcf_file_path) vcf_reader_handle.close() utils.sample_error( "Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True) continue vcf_writer_removed.close() vcf_reader_handle.close() shutil.copyfile(vcf_file_path, preserved_vcf_file_path) else: #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF. snp_dict = defaultdict(list) for vcf_data_line in vcf_reader: #Create a dict to store all SNPs in this sample #get contig length from contig name.The CHROM should be a contig name in the format of Velvet/SPAdes output. record = (vcf_data_line.POS, vcf_data_line) snp_dict[vcf_data_line.CHROM].append(record) #Find bad regions and add them into bad_region for contig, snp_list in snp_dict.items(): #sort all SNPs in this contig by position sorted_list = sorted(snp_list, key=lambda SNPs: SNPs[0]) #total number of SNPs num_of_snp = len(sorted_list) if contig not in bad_regions_dict: #New contig try: contig_length = contig_length_dict[contig] except: #cannot find contig length. Use the sys.maxsize. contig_length = sys.maxsize if (contig_length <= (edge_length * 2)): bad_regions_dict[contig] = [(0, contig_length)] else: region = [(0, edge_length), (contig_length - edge_length, contig_length)] bad_regions_dict[contig] = region #Process SNPs for idx, snp in enumerate(sorted_list): if (idx + max_num_snp) < num_of_snp: pos_start = snp[0] pos_end = sorted_list[idx + max_num_snp][0] if (pos_start + window_size) >= pos_end: #Add bad region regions = bad_regions_dict[contig] temp_region = (pos_start, pos_end) regions.append(temp_region) vcf_reader_handle.close() #Combine all bad regions for each contig for contig, regions in bad_regions_dict.items(): sorted_regions = utils.sort_coord(regions) combined_regions = utils.consensus(sorted_regions) bad_regions_dict[contig] = combined_regions #Scan vcf files to remove SNPs for vcf_file_path in list_of_vcf_files: if not need_rebuild_dict[vcf_file_path]: continue #Get sample ID ss = vcf_file_path.split('/') sample_ID = ss[-2] if sample_ID not in sorted_list_of_outgroup_samples: try: vcf_reader_handle = open(vcf_file_path, 'r') vcf_reader = vcf.Reader(vcf_reader_handle) except: utils.sample_error( "Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True) continue #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF. preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" try: vcf_writer_preserved = None vcf_writer_preserved = vcf.Writer( open(preserved_vcf_file_path, 'w'), vcf_reader) except: if vcf_writer_preserved is not None: vcf_writer_preserved.close() os.remove(preserved_vcf_file_path) vcf_reader_handle.close() utils.sample_error( "Error: Cannot create the file for preserved SNPs: %s." 
% preserved_vcf_file_path, continue_possible=True) continue try: vcf_writer_removed = None vcf_writer_removed = vcf.Writer( open(removed_vcf_file_path, 'w'), vcf_reader) except: #close vcf_writer_reserved and remove the file reserved_vcf_file_path if vcf_writer_removed is not None: vcf_writer_removed.close() os.remove(removed_vcf_file_path) vcf_writer_preserved.close() vcf_reader_handle.close() utils.sample_error( "Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True) continue for vcf_data_line in vcf_reader: #Create a dict to store all SNPs in this sample #get contig length from contig name.The CHROM should be a contig name in the format of Velvet/SPAdes output. contig = vcf_data_line.CHROM if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]): #Remove this SNP vcf_writer_removed.write_record(vcf_data_line) else: #Preserve this SNP vcf_writer_preserved.write_record(vcf_data_line) vcf_writer_preserved.close() vcf_writer_removed.close() vcf_reader_handle.close()
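# --------------------------------------------------------------------------
# Illustrative sketch: the dense-region test above slides over the sorted SNP
# positions of one contig and flags a region whenever more than max_num_snp
# SNPs fall within window_size bases, in addition to the fixed edge regions at
# both ends of the contig.  The helper below restates that logic for a single
# contig so it can be read or tested in isolation; the function name and the
# list-of-positions input are illustrative, not part of the pipeline.
# --------------------------------------------------------------------------
def example_find_bad_regions(sorted_positions, contig_length, edge_length,
                             window_size, max_num_snp):
    """Return a list of (start, end) regions to filter for one contig."""
    if contig_length <= edge_length * 2:
        regions = [(0, contig_length)]  # short contig: the whole contig is an edge region
    else:
        regions = [(0, edge_length),
                   (contig_length - edge_length, contig_length)]
    num_snps = len(sorted_positions)
    for idx, pos_start in enumerate(sorted_positions):
        if idx + max_num_snp < num_snps:
            pos_end = sorted_positions[idx + max_num_snp]
            if pos_start + window_size >= pos_end:
                regions.append((pos_start, pos_end))  # too many SNPs in one window
    return regions

# Example usage (overlapping regions are merged afterwards, as in the code above):
#     example_find_bad_regions([1000, 1020, 1030, 1700], contig_length=10000,
#                              edge_length=500, window_size=1000, max_num_snp=3)
#     -> [(0, 500), (9500, 10000), (1000, 1700)]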
def call_consensus(args): """Call the consensus base for a sample Call the consensus base for a sample at the positions where SNPs were found in any of the samples. This function expects, or creates '(*)', the following files arranged in the following way: snplist.txt samples sample_name_one/reads.all.pileup sample_name_one/consensus.fasta (*) The files are used as follows: 1. The snplist.txt input file contains the list of SNP positions extracted from all the var.flt.vcf files combined. 2. The reads.all.pileup input file is a pileups at all positions used to determine the nucleotide base at each SNP position. 3. The consensus.fasta output file contains the SNP calls for each sequence, arranged as a fasta file with one sequence per sample. The snplist.txt, and reads.all.pileup are created outside of this function. The package documentation provides an example of creating these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : namespace forceFlag : boolean flag to force processing even when result file already exists and is newer than inputs snpListFile : str File path (not just file name) of text format list of SNP positions excludeFile : str File path of VCF file of positions to exclude from the snp matrix. allPileupFile : str Relative or absolute path to the genome-wide pileup file for this sample consensusFile : str Output file. Relative or absolute path to the consensus fasta file for this sample. minBaseQual : int Mimimum base quality score to count a read. All other snp filters take effect after the low-quality reads are discarded. minConsFreq : float Consensus frequency. Mimimum fraction of high-quality reads supporting the consensus to make a call. minConsStrdDpth : int Consensus strand depth. Minimum number of high-quality reads supporting the consensus which must be present on both the forward and reverse strands to make a call. minConsStrdBias : float Strand bias. Minimum fraction of the high-quality consensus-supporting reads which must be present on both the forward and reverse strands to make a call. The numerator of this fraction is the number of high-quality consensus-supporting reads on one strand. The denominator is the total number of high-quality consensus-supporting reads on both strands combined. Raises: Examples: args = argparse.Namespace args.snpListFile = 'snplist.txt' args.allPileupFile = 'reads.all.pileup' args.consensusFile = 'consensus.fasta' args.minBaseQual = 15 args.minConsFreq = 0.6 args.minConsStrdDpth = 4 args.minConsStrdBias = 0.10 args.vcfFailedSnpGt = '.' 
call_consensus(args) """ utils.print_log_header() utils.print_arguments(args) snp_list_file_path = args.snpListFile all_pileup_file_path = args.allPileupFile sample_directory = os.path.dirname(os.path.abspath(all_pileup_file_path)) sample_name = os.path.basename(sample_directory) consensus_file_path = args.consensusFile consensus_file_dir = os.path.dirname(os.path.abspath(consensus_file_path)) vcf_file_name = args.vcfFileName vcf_file_path = os.path.join(consensus_file_dir, vcf_file_name) if vcf_file_name else None bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path]) if bad_file_count > 0: utils.global_error("Error: cannot call consensus without the snplist file.") bad_file_count = utils.verify_non_empty_input_files("Pileup file", [all_pileup_file_path]) if bad_file_count > 0: utils.sample_error("Error: cannot call consensus without the pileup file.", continue_possible=False) source_files = [snp_list_file_path, all_pileup_file_path] exclude_file_path = args.excludeFile if exclude_file_path: bad_file_count = utils.verify_existing_input_files("Exclude file", [exclude_file_path]) if bad_file_count > 0: utils.sample_error("Error: cannot call consensus without the file of excluded positions.", continue_possible=False) excluded_positions = utils.convert_vcf_file_to_snp_set(exclude_file_path) source_files.append(exclude_file_path) else: excluded_positions = set() # Check if the result is already fresh if not args.forceFlag and not utils.target_needs_rebuild(source_files, consensus_file_path): utils.verbose_print("Consensus call file %s has already been freshly built. Use the -f option to force a rebuild." % consensus_file_path) return # Load the list of which positions to called snp_list = utils.read_snp_position_list(snp_list_file_path) snplist_length = len(snp_list) utils.verbose_print("snp position list length = %d" % snplist_length) utils.verbose_print("excluded snps list length = %d" % len(excluded_positions)) utils.verbose_print("total snp position list length = %d" % (snplist_length + len(excluded_positions))) # Call consensus. Write results to file. 
position_consensus_base_dict = dict() caller = pileup.ConsensusCaller(args.minConsFreq, args.minConsStrdDpth, args.minConsStrdBias) snp_positions = set(snp_list) if args.vcfAllPos: parse_positions = None else: parse_positions = snp_positions.union(excluded_positions) pileup_reader = pileup.Reader(all_pileup_file_path, args.minBaseQual, parse_positions) if vcf_file_name: writer = vcf_writer.SingleSampleWriter(vcf_file_path, args.vcfPreserveRefCase) filters = caller.get_filter_descriptions() # TODO: it would be better if the exclude file contained filter headers we could read and re-use here instead of hard-coding this filters.append(("Region", "Position is in dense region of snps or near the end of the contig.")) writer.write_header(sample_name, filters, args.vcfRefName) for pileup_record in pileup_reader: chrom = pileup_record.chrom pos = pileup_record.position consensus_base, fail_reasons = caller.call_consensus(pileup_record) if (chrom, pos) in excluded_positions: # TODO: it would be better if the exclude file contained filter reasons we could re-use here instead of hard coding this fail_reasons = fail_reasons or [] fail_reasons.append("Region") if (chrom, pos) in snp_positions: if fail_reasons: position_consensus_base_dict[(chrom, pos)] = '-' else: position_consensus_base_dict[(chrom, pos)] = consensus_base if vcf_file_name: writer.write_from_pileup(pileup_record, fail_reasons, args.vcfFailedSnpGt) if vcf_file_name: writer.close() utils.verbose_print("called consensus positions = %i" % (len(position_consensus_base_dict))) consensus_list = [position_consensus_base_dict.get(key, '-') for key in snp_list] consensus_str = ''.join(consensus_list) snp_seq_record = SeqRecord(Seq(consensus_str), id=sample_name, description="") # Write the consensus calls to a fasta file with open(consensus_file_path, "w") as fasta_file_object: SeqIO.write([snp_seq_record], fasta_file_object, "fasta")
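# --------------------------------------------------------------------------
# Illustrative sketch: the consensus filters described in the docstring above
# (minConsFreq, minConsStrdDpth, minConsStrdBias) can be pictured as threshold
# checks on the counts of high-quality reads supporting the most common base.
# The helper below is an assumption-based restatement of those checks, not the
# pileup.ConsensusCaller implementation; the filter names it returns are
# illustrative only.
# --------------------------------------------------------------------------
def example_consensus_filters(consensus_fwd, consensus_rev, total_good_reads,
                              min_cons_freq, min_cons_strand_depth,
                              min_cons_strand_bias):
    """Return the names of failed filters for one pileup position (sketch only).

    consensus_fwd / consensus_rev : high-quality reads supporting the consensus
        base on the forward / reverse strand.
    total_good_reads : all high-quality reads at the position.
    """
    fail_reasons = []
    consensus_depth = consensus_fwd + consensus_rev
    if total_good_reads == 0 or consensus_depth / float(total_good_reads) < min_cons_freq:
        fail_reasons.append("LowConsensusFrequency")
    if min(consensus_fwd, consensus_rev) < min_cons_strand_depth:
        fail_reasons.append("LowStrandDepth")
    if consensus_depth > 0 and \
            min(consensus_fwd, consensus_rev) / float(consensus_depth) < min_cons_strand_bias:
        fail_reasons.append("StrandBias")
    return fail_reasons

# Example usage with the thresholds from the docstring example:
#     example_consensus_filters(consensus_fwd=10, consensus_rev=2, total_good_reads=14,
#                               min_cons_freq=0.6, min_cons_strand_depth=4,
#                               min_cons_strand_bias=0.10)
#     -> ['LowStrandDepth']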
def map_reads(args): """Align reads to the reference. Execute an external program (bowtie2 or smalt) to map the fastq reads to a reference file. The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt. This function expects, or creates '(*)', the following files arranged in the following way: reference referenceFile.fasta samples sample_name_one/sampleFastqFile_1.fastq sample_name_one/sampleFastqFile_2.fastq sample_name_one/reads.sam* The reverse fastq file is optional. The fastq files may be either compressed with gzip or uncompressed. The reverse fastq file is optional. All the input files are created outside of this function. The package documentation provides an example of preparing these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : argparse.Namespace referenceFile : File path of the reference fasta file sampleFastqFile1 : File path of the forward fastq file sampleFastqFile2 : Optional file path of the reverse fastq file """ utils.print_log_header() utils.print_arguments(args) #========================================================================== # Validate inputs #========================================================================== # Verify reference fasta file exists and is not empty reference_file_path = args.referenceFile utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global") # Verify fastq files exist and are not empty sample_fastq_file1 = args.sampleFastqFile1 sample_fastq_file2 = args.sampleFastqFile2 fastq_files = [sample_fastq_file1] if sample_fastq_file2: fastq_files.append(sample_fastq_file2) utils.verify_non_empty_input_files("Sample file", fastq_files, error_handler="sample") # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2" snp_pipeline_aligner = snp_pipeline_aligner.lower() if snp_pipeline_aligner not in ["bowtie2", "smalt"]: utils.global_error( "Error: only bowtie2 and smalt aligners are supported.") sample_dir = os.path.dirname(sample_fastq_file1) sample_id = utils.sample_id_from_file(sample_fastq_file1) reference_base_path = os.path.splitext(reference_file_path)[ 0] # strip the file extension reference_id = os.path.basename(reference_base_path) #========================================================================== # Check if alignment to reference has already been done #========================================================================== sam_file = os.path.join(sample_dir, "reads.sam") source_files = [sample_fastq_file1] if sample_fastq_file2: source_files.append(sample_fastq_file2) if snp_pipeline_aligner == "bowtie2": source_files.append(reference_base_path + ".rev.1.bt2") elif snp_pipeline_aligner == "smalt": source_files.append(reference_base_path + ".smi") needs_rebuild = utils.target_needs_rebuild(source_files, sam_file) if not args.forceFlag and not needs_rebuild: verbose_print( "# %s has already been aligned to %s. Use the -f option to force a rebuild." 
% (sample_id, reference_id)) return #========================================================================== # Construct the command line to execute bowtie2 or smalt #========================================================================== # The read group identifies reads from a single run and lane read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1, sample_id) # Default to 8 cores on HPC or all cpu cores on workstation if os.environ.get("JOB_ID") or os.environ.get("PBS_JOBID"): num_cores = 8 else: num_cores = psutil.cpu_count() num_cores_param = "" if snp_pipeline_aligner == "bowtie2": version_str = utils.extract_version_str("bowtie2", "bowtie2 --version") # Parse the user-specified bowtie parameters to determine if the user specified the number of CPU cores bowtie2_align_extra_params = os.environ.get( "Bowtie2Align_ExtraParams") or "" if not utils.detect_numeric_option_in_parameters_str( bowtie2_align_extra_params, "-p"): num_cores_param = "-p " + str(num_cores) # Specify the read group and sample tags here, --rg tags cannot be specified without ID. # The read group tags are used by some downstream tools, like Picard and GATK. read_group_params = "" if read_group_tags: read_group_params += " --rg-id " + read_group_tags.ID read_group_params += " --rg SM:" + read_group_tags.SM read_group_params += " --rg LB:" + read_group_tags.LB read_group_params += " --rg PL:" + read_group_tags.PL read_group_params += " --rg PU:" + read_group_tags.PU # Substitute the default parameters if the user did not specify bowtie parameters bowtie2_align_params = bowtie2_align_extra_params or "--reorder -q" # Build the command with options depending on whether the fastq files are paired command_line = "bowtie2 " + num_cores_param + " " + read_group_params + " " + bowtie2_align_params + " -x " + reference_base_path if sample_fastq_file2: command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2 else: command_line += " -U " + sample_fastq_file1 elif snp_pipeline_aligner == "smalt": version_str = utils.extract_version_str("smalt", "smalt version") # Parse the user-specified smalt parameters to determine if the user specified the number of CPU cores smalt_align_extra_params = os.environ.get( "SmaltAlign_ExtraParams") or "" if not utils.detect_numeric_option_in_parameters_str( smalt_align_extra_params, "-n"): num_cores_param = "-n " + str(num_cores) # Substitute the default parameters if the user did not specify smalt parameters smalt_align_params = smalt_align_extra_params or "-O" # Don't use the -i 1000 option if the fastq file is unpaired if not sample_fastq_file2: smalt_align_params = re.sub( "-i[ ]+[0-9]+", '', smalt_align_extra_params) # regex substitute command_line = "smalt map " + num_cores_param + " " + smalt_align_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + ( sample_fastq_file2 or "") #========================================================================== # Run the command to execute bowtie2 or smalt #========================================================================== verbose_print("# Align sequence %s to reference %s" % (sample_id, reference_id)) verbose_print("# %s %s" % (utils.timestamp(), command_line)) verbose_print("# %s" % version_str) command.run(command_line, sam_file) #========================================================================== # When using smalt, assign read groups in a separate step. # This is already done when using bowtie2. 
#========================================================================== if snp_pipeline_aligner == "smalt" and read_group_tags: smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam") shutil.move(sam_file, smalt_sam_file) version_str = utils.extract_version_str( "Picard", "java picard.cmdline.PicardCommandLine AddOrReplaceReadGroups --version 2>&1" ) jvm_params = os.environ.get("PicardJvm_ExtraParams") or "" command_line = "java " + jvm_params + " picard.cmdline.PicardCommandLine AddOrReplaceReadGroups" command_line += " I=" + smalt_sam_file command_line += " O=" + sam_file command_line += " RGID=" + read_group_tags.ID command_line += " RGSM=" + read_group_tags.SM command_line += " RGLB=" + read_group_tags.LB command_line += " RGPL=" + read_group_tags.PL command_line += " RGPU=" + read_group_tags.PU verbose_print("") verbose_print("# Assign read group id %s" % (read_group_tags.ID)) verbose_print("# %s %s" % (utils.timestamp(), command_line)) verbose_print("# %s" % version_str) command.run(command_line, sys.stdout)
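# --------------------------------------------------------------------------
# Illustrative sketch: a minimal builder showing how the paired vs. unpaired
# bowtie2 invocation above is assembled.  Only the options actually used above
# (-p, -x, -1/-2, -U and the default "--reorder -q") appear here; the
# read-group tags are omitted for brevity and the function name is
# hypothetical.
# --------------------------------------------------------------------------
def example_build_bowtie2_command(reference_base_path, fastq1, fastq2=None,
                                  num_cores=8, extra_params="--reorder -q"):
    """Return the bowtie2 command string for one sample (sketch only)."""
    command_line = "bowtie2 -p %d %s -x %s" % (num_cores, extra_params,
                                               reference_base_path)
    if fastq2:
        command_line += " -1 %s -2 %s" % (fastq1, fastq2)  # paired-end reads
    else:
        command_line += " -U %s" % fastq1                  # unpaired reads
    return command_line

# Example usage:
#     example_build_bowtie2_command("reference/referenceFile",
#                                   "samples/sample_name_one/sample_1.fastq",
#                                   "samples/sample_name_one/sample_2.fastq")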
def collect_metrics(args): """Collect the quality metrics and SNP metrics for a sample. This function expects, or creates '(*)', the following files arranged in the following way: reference referenceFile.fasta samples sample_name_one/*.fastq.gz sample_name_one/reads.sam sample_name_one/reads.sorted.deduped.bam sample_name_one/reads.sorted.bam sample_name_one/reads.all.pileup sample_name_one/var.flt.vcf sample_name_one/var.flt_preserved.vcf sample_name_one/consensus.fasta sample_name_one/consensus_preserved.fasta sample_name_one/consensus.vcf sample_name_one/consensus_preserved.vcf sample_name_one/metrics* The input files are created outside of this function. The package documentation provides an example of preparing these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : argparse.Namespace referenceFile : File path of the reference fasta file sampleDir : Relative or absolute directory of the sample consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory maxSnps : Maximum allowed number of SNPs per sample metricsFile : Output file. Relative or absolute path to the metrics file """ utils.print_log_header(classpath=True) utils.print_arguments(args) #========================================================================== # Validate inputs #========================================================================== # Verify reference fasta file exists and is not empty reference_file_path = args.referenceFile utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global") sample_dir = args.sampleDir utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False) metrics_file_path = args.metricsFile max_allowed_snps = args.maxSnps consensus_vcf_file_name = args.consensusVcfFileName consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName consensus_fasta_file_name = args.consensusFastaFileName consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName sample_id = utils.sample_id_from_dir(sample_dir) #========================================================================== # Read existing metrics file so some metrics can be reused #========================================================================== try: metrics = utils.read_properties(metrics_file_path) except IOError: metrics = dict() #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header")) #------------------------- machine = "" flowcell = "" fastq_files = fastq.list_fastq_files(sample_dir) fastq_files = [f for f in fastq_files if os.path.isfile(f)] # Exclude broken symlinks if not fastq_files: handle_error("No fastq files were found.") else: tags = fastq.extract_metadata_tags(fastq_files[0]) if tags: machine = tags.instrument or "" flowcell = tags.flow_cell or "" #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files")) #------------------------- fastq_file_size = "" fastq_file_list = "" if fastq_files: fastq_file_size = sum([os.path.getsize(file) for file in 
fastq_files]) # Make a comma separated list of just the fastq file names without directories fastq_file_list = [os.path.basename(file) for file in fastq_files] fastq_file_list = ", ".join(fastq_file_list) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads and %mapped from sam file")) #------------------------- num_reads = "" percent_reads_mapped = "" file = os.path.join(sample_dir, "reads.sam") if verify_input_file("SAM file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: num_reads = metrics.get("numberReads", "") # reuse already fresh metrics percent_reads_mapped = metrics.get("percentReadsMapped", "") # reuse already fresh metrics if num_reads and percent_reads_mapped: verbose_print("Reusing previously calculated number of reads and %mapped") else: num_reads = command.run("samtools view -S -c " + file) num_reads = num_reads.strip() mapped = command.run("samtools view -S -c -F 4 " + file) mapped = mapped.strip() try: percent_reads_mapped = 100.0 * float(mapped) / float(num_reads) percent_reads_mapped = "%.2f" % percent_reads_mapped except ValueError: handle_error("Cannot calculate number of reads and %mapped.") #------------------------- # Calculate number of duplicate reads from deduped bam file #------------------------- num_dup_reads = "" remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true" remove_duplicate_reads = remove_duplicate_reads.lower() if remove_duplicate_reads == "true": verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file")) file = os.path.join(sample_dir, "reads.sorted.deduped.bam") if verify_input_file("Deduped BAM file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: num_dup_reads = metrics.get("numberDupReads", "") # reuse already fresh metrics if num_dup_reads: verbose_print("Reusing previously calculated number of duplicate reads") else: num_dup_reads = command.run("samtools view -S -c -f 1024 " + file) num_dup_reads = num_dup_reads.strip() #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean insert size from bam file")) #------------------------- ave_insert_size = "" file = os.path.join(sample_dir, "reads.sorted.bam") if verify_input_file("BAM file", file): # Metrics already freshly collected? 
needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: ave_insert_size = metrics.get("aveInsertSize", "") # reuse already fresh metrics if ave_insert_size: verbose_print("Reusing previously calculated mean insert size") else: # Extract inferred insert sizes (TLEN, column 9 of BAM file) for reads "mapped in proper pair" (2) and "first in pair" (64) = 66 tempfile = NamedTemporaryFile(delete=False, dir=sample_dir, prefix="tmp.inserts.", mode='w') command.run("samtools view -f 66 " + file + " | cut -f 9 | sed 's/^-//'", tempfile.name) insert_count = 0 insert_sum = 0 with open(tempfile.name) as f: for line in f: try: insert_sum += int(line) insert_count += 1 except ValueError: pass os.unlink(tempfile.name) if insert_count > 0 and insert_sum > 0: ave_insert_size = float(insert_sum) / float(insert_count) ave_insert_size = "%.2f" % ave_insert_size else: handle_error("Cannot calculate mean insert size.") #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file")) #------------------------- ave_pileup_depth = "" file = os.path.join(sample_dir, "reads.all.pileup") if verify_input_file("Pileup file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: ave_pileup_depth = metrics.get("avePileupDepth", "") # reuse already fresh metrics if ave_pileup_depth: verbose_print("Reusing previously calculated mean pileup depth") else: depth_sum = 0 with open(file) as f: for line in f: tokens = line.split() try: depth_sum += int(tokens[3]) except (ValueError, IndexError): pass reference_length = 0 for record in SeqIO.parse(reference_file_path, "fasta"): reference_length += len(record) if depth_sum > 0 and reference_length > 0: #print("depth_sum=%i" % depth_sum); #print("reference_length=%i" % reference_length) ave_pileup_depth = float(depth_sum) / float(reference_length) ave_pileup_depth = "%.2f" % ave_pileup_depth else: handle_error("Cannot calculate mean pileup depth.") #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file")) #------------------------- phase1_snps = "" excluded_sample = "" file = os.path.join(sample_dir, "var.flt.vcf") if verify_input_file("VCF file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase1_snps = metrics.get("phase1Snps", "") # reuse already fresh metrics if phase1_snps: verbose_print("Reusing previously calculated phase1 snps") else: phase1_snps = count_vcf_file_snps(file) # Flag excessive snps if max_allowed_snps > 0 and phase1_snps > max_allowed_snps: excluded_sample = "Excluded" handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps) phase1_snps = str(phase1_snps) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file")) #------------------------- phase1_snps_preserved = "" excluded_sample_preserved = "" file = os.path.join(sample_dir, "var.flt_preserved.vcf") if verify_input_file("VCF file", file): # Metrics already freshly collected? 
needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "") # reuse already fresh metrics if phase1_snps_preserved: verbose_print("Reusing previously calculated preserved phase1 snps") else: phase1_snps_preserved = count_vcf_file_snps(file) # Flag excessive snps if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps: excluded_sample_preserved = "Excluded" handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps) phase1_snps_preserved = str(phase1_snps_preserved) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file")) #------------------------- phase2_snps = "" file = os.path.join(sample_dir, consensus_vcf_file_name) if verify_input_file("Consensus VCF file", file): # Omit the phase2 snp count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase2_snps = metrics.get("snps", "") # reuse already fresh metrics if phase2_snps: verbose_print("Reusing previously calculated phase2 snps") else: phase2_snps = count_vcf_file_snps(file) phase2_snps = str(phase2_snps) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file")) #------------------------- phase2_snps_preserved = "" file = os.path.join(sample_dir, consensus_preserved_vcf_file_name) if verify_input_file("Consensus VCF file", file): # Omit the phase2 snp count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample_preserved != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase2_snps_preserved = metrics.get("snpsPreserved", "") # reuse already fresh metrics if phase2_snps_preserved: verbose_print("Reusing previously calculated preserved phase2 snps") else: phase2_snps_preserved = count_vcf_file_snps(file) phase2_snps_preserved = str(phase2_snps_preserved) #------------------------------------------ verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix")) #------------------------------------------ missing_pos = "" file = os.path.join(sample_dir, consensus_fasta_file_name) if verify_input_file("Consensus fasta file", file): # Omit the phase2 gap count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample != "Excluded": # Metrics already freshly collected? 
needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: missing_pos = metrics.get("missingPos", "") # reuse already fresh metrics if missing_pos: verbose_print("Reusing previously calculated missing positions") else: missing_pos = count_missing_snp_matrix_positions(file, sample_id) missing_pos = str(missing_pos) #------------------------------------------ verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix")) #------------------------------------------ missing_pos_preserved = "" file = os.path.join(sample_dir, consensus_preserved_fasta_file_name) if verify_input_file("Consensus fasta file", file): # Omit the phase2 gap count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample_preserved != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: missing_pos_preserved = metrics.get("missingPosPreserved", "") # reuse already fresh metrics if missing_pos_preserved: verbose_print("Reusing previously calculated missing positions") else: missing_pos_preserved = count_missing_snp_matrix_positions(file, sample_id) missing_pos_preserved = str(missing_pos_preserved) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Print results")) #------------------------- with open(metrics_file_path, "w") as f: print("sample=" + '"' + sample_id + '"', file=f) print("fastqFileList=" + '"' + fastq_file_list + '"', file=f) print("fastqFileSize=" + str(fastq_file_size), file=f) print("machine=" + machine, file=f) print("flowcell=" + flowcell, file=f) print("numberReads=" + num_reads, file=f) print("numberDupReads=" + num_dup_reads, file=f) print("percentReadsMapped=" + percent_reads_mapped, file=f) print("aveInsertSize=" + ave_insert_size, file=f) print("avePileupDepth=" + ave_pileup_depth, file=f) print("phase1Snps=" + phase1_snps, file=f) print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f) print("snps=" + phase2_snps, file=f) print("snpsPreserved=" + phase2_snps_preserved, file=f) print("missingPos=" + missing_pos, file=f) print("missingPosPreserved=" + missing_pos_preserved, file=f) print("excludedSample=" + excluded_sample, file=f) print("excludedSamplePreserved=" + excluded_sample_preserved, file=f) print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
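# --------------------------------------------------------------------------
# Illustrative sketch: the mean insert size above is computed from the
# absolute TLEN values (column 9 of the SAM/BAM records) of reads that are
# both "mapped in proper pair" (flag 2) and "first in pair" (flag 64), i.e.
# the samtools filter -f 66.  Given those TLEN values as an iterable, the
# calculation reduces to the helper below; the function name is hypothetical.
# --------------------------------------------------------------------------
def example_mean_insert_size(tlen_values):
    """Return the mean absolute insert size as a string, or "" if none."""
    insert_sum = 0
    insert_count = 0
    for tlen in tlen_values:
        try:
            insert_sum += abs(int(tlen))
            insert_count += 1
        except ValueError:
            pass  # skip non-numeric lines, mirroring the loop above
    if insert_count > 0 and insert_sum > 0:
        return "%.2f" % (float(insert_sum) / insert_count)
    return ""

# Example usage (values as produced by: samtools view -f 66 reads.sorted.bam | cut -f 9):
#     example_mean_insert_size(["350", "-348", "352"])  -> '350.00'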
def merge_vcfs(args):
    """Merge the per-sample VCF files.

    Execute an external program (bcftools merge) to merge the VCF files.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            samples
                sample_name_one/consensus.vcf
            snpma.vcf*

    All the input files are created outside of this function. Before running
    this command, the vcf file for each sample must be created by the
    call_consensus.py script.

    The package documentation provides an example of preparing these files
    based on the lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile : Path to file containing a list of directories -- one per sample
        vcfFileName : File name of the vcf files which must exist in each of the sample directories
        mergedVcfFile : Path to the output merged multi-vcf file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================
    sample_directories_list_path = args.sampleDirsFile
    vcf_file_name = args.vcfFileName
    merged_vcf_file = args.mergedVcfFile

    utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path], error_handler="global")

    with open(sample_directories_list_path, "r") as f:
        sample_directories = [line.rstrip() for line in f]
    sample_directories = [d for d in sample_directories if d]
    vcf_files = [os.path.join(d, vcf_file_name) for d in sample_directories]

    good_vcf_files = []
    for vcf_file in vcf_files:
        bad = utils.verify_non_empty_input_files("Sample vcf file", [vcf_file], error_handler="sample", continue_possible=True)
        if not bad:
            good_vcf_files.append(vcf_file)

    if len(good_vcf_files) == 0:
        utils.global_error("There are no vcf files to merge.")

    #==========================================================================
    # Check if merge has already been done
    #==========================================================================
    needs_rebuild = utils.target_needs_rebuild(vcf_files, merged_vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Multi-VCF file is already freshly created. Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Copy, Compress, Index, Merge
    #==========================================================================

    # If there is only one good sample, just copy the consensus VCF file to the snpma.vcf file
    if len(good_vcf_files) == 1:
        shutil.copy(good_vcf_files[0], merged_vcf_file)
        return

    # Copy single VCF files to a common directory where the files will be edited
    verbose_print("# %s Copying VCF files to temp directory" % utils.timestamp())
    parent_of_temp_dir = os.path.dirname(merged_vcf_file)
    temp_dir = tempfile.mkdtemp(prefix="tmp.vcf.", dir=parent_of_temp_dir)
    file_copies = []
    for d in sample_directories:
        src_file = os.path.join(d, vcf_file_name)
        if src_file in good_vcf_files:
            dst_file = os.path.join(temp_dir, os.path.basename(d) + ".vcf")
            file_copies.append(dst_file)
            verbose_print("copy %s %s" % (src_file, dst_file))
            #if not os.path.isfile(dst_file) or os.stat(src_file).st_mtime > os.stat(dst_file).st_mtime:
            shutil.copy2(src_file, dst_file)

    # bgzip all the sample vcf files
    verbose_print("# %s Compressing VCF files" % utils.timestamp())
    for file in file_copies:
        verbose_print("bgzip -c %s > %s" % (file, file + ".gz"))
        command.run("bgzip -c " + file, file + ".gz")

    # Index all the zipped sample vcf files
    verbose_print("# %s Indexing VCF files" % utils.timestamp())
    for file in file_copies:
        file += ".gz"
        verbose_print("tabix -f -p vcf " + file)
        command.run("tabix -f -p vcf " + file, sys.stdout)

    # Substitute the default parameters if the user did not specify bcftools parameters
    default_params = "--merge all --info-rules NS:sum"
    bcf_tools_extra_params = os.environ.get("BcftoolsMerge_ExtraParams") or default_params

    # Merge the VCFs
    verbose_print("# %s Merging VCF files" % utils.timestamp())
    command_line = "bcftools merge -o " + merged_vcf_file + ' ' + bcf_tools_extra_params + ' ' + temp_dir + "/*.gz"
    verbose_print(command_line)
    command.run(command_line, sys.stdout)

    # Clean up
    shutil.rmtree(temp_dir)
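# Illustrative sketch (not part of the pipeline): merge_vcfs, like the other commands in this
# module, skips work when the output is newer than all of its inputs.  The real
# utils.target_needs_rebuild may handle more cases; this minimal version, which relies on the
# module-level os import, only compares modification times and treats a missing target or a
# missing input as "needs rebuild".
def example_target_needs_rebuild(source_files, target_file):
    """Return True if the target is missing or older than any source file."""
    if not os.path.isfile(target_file):
        return True
    target_mtime = os.path.getmtime(target_file)
    for source_file in source_files:
        if not os.path.isfile(source_file):
            return True  # cannot verify freshness without the input
        if os.path.getmtime(source_file) > target_mtime:
            return True
    return False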
def create_snp_pileup(options_dict):
    """Create the SNP pileup file for a sample.

    Description:
    Create the SNP pileup file for a sample -- the pileup file restricted
    to only positions where variants were found in any sample.
    This function expects, or creates '(*)', the following files arranged
    in the following way:
            snplist.txt
            samples
                sample_name_one/reads.all.pileup
                sample_name_one/reads.snp.pileup (*)
            ...

    The files are used as follows:
        1. The snplist.txt input file contains the list of SNP positions
           extracted from the var.flt.vcf file.
        2. The reads.all.pileup input file is the genome-wide pileup file
           for this sample.
        3. The reads.snp.pileup output file is the pileup file for this
           sample, restricted to only positions where variants were found
           in any sample.

    The snplist.txt and reads.all.pileup files are created outside of this
    function. The package documentation provides an example of creating these
    files based on the lambda_virus sequence that is used as one test for this
    package.

    Args:
        snpListFile: File path (not just file name) of text format list of SNP positions across all samples
        allPileupFile: File path (not just file name) of the whole-genome pileup file for this sample
        snpPileupFile: File path (not just file name) of the snp pileup file

    Raises:

    Examples:
    options_dict = {'snpListFile':'snplist.txt',
                    'allPileupFile':'samples/SRR555888/reads.all.pileup',
                    'snpPileupFile':'samples/SRR555888/reads.snp.pileup'
                   }
    create_snp_pileup(options_dict)
    """
    print_log_header()
    verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short()))
    verbose_print("# %s version %s" % (utils.program_name(), __version__))
    print_arguments(options_dict)

    snp_list_file_path = options_dict['snpListFile']
    all_pileup_file_path = options_dict['allPileupFile']
    snp_pileup_file_path = options_dict['snpPileupFile']

    source_files = [snp_list_file_path, all_pileup_file_path]
    if options_dict['forceFlag'] or utils.target_needs_rebuild(source_files, snp_pileup_file_path):
        # Create a pileup file with a subset of the whole-genome pileup restricted
        # to locations with SNPs only.
        snp_list = utils.read_snp_position_list(snp_list_file_path)
        utils.create_snp_pileup(all_pileup_file_path, snp_pileup_file_path, set(snp_list))
        verbose_print("")
    else:
        verbose_print("SNP pileup %s has already been freshly built. Use the -f option to force a rebuild." % snp_pileup_file_path)

    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
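# Illustrative sketch (not part of the pipeline): create_snp_pileup keeps only the pileup
# records at known SNP positions.  The real utils.create_snp_pileup may differ; this minimal
# version assumes the pileup file is tab-delimited with the chromosome name in column 1 and
# the 1-based position in column 2, and that snp_positions is a set of (chrom, pos) tuples.
def example_filter_pileup_to_snp_positions(all_pileup_path, snp_pileup_path, snp_positions):
    """Write only the pileup lines whose (chrom, position) is in snp_positions."""
    with open(all_pileup_path) as pileup_in, open(snp_pileup_path, "w") as pileup_out:
        for line in pileup_in:
            fields = line.split("\t")
            if len(fields) < 2:
                continue
            key = (fields[0], int(fields[1]))
            if key in snp_positions:
                pileup_out.write(line)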
def create_snp_list(options_dict): """Create SNP list file Description: Create the SNP list -- the list of positions where variants were found and the corresponding list of samples having a variant at each position. This function expects, or creates '(*)', the following files arranged in the following way: sampleDirectories.txt samples sample_name_one/var.flt.vcf ... snplist.txt (*) The files are used as follows: 1. The sampleDirectories.txt input file contains a list of the paths to the sample directories. 2. The var.flt.vcf variant input files are used to construct the SNP position list. 3. The snplist.txt output file contains the union of the SNP positions and sample names extracted from all the var.flt.vcf files. The sampleDirectories.txt and var.flt.vcf files are created outside of this function. The package documentation provides an example of creating these files based on the lambda_virus sequence that is used as one test for this package. Args: sampleDirsFile: File path (not just file name) of file containing paths to directories containing var.flt.vcf file for each sequence. vcfFileName: File name of the VCF files which must exist in each of the sample directories snpListFile: File path (not just file name) of text format list of SNP positions Raises: Examples: options_dict = {'sampleDirsFile':'sampleDirectories.txt', 'vcfFileName':'var.flt.vcf' 'snpListFile':'snplist.txt', } create_snp_list(options_dict) """ print_log_header() verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short())) verbose_print("# %s version %s" % (utils.program_name(), __version__)) print_arguments(options_dict) #========================================================================== # Prep work #========================================================================== sample_directories_list_filename = options_dict['sampleDirsFile'] bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_filename]) if bad_file_count > 0: utils.global_error(None) with open(sample_directories_list_filename, "r") as sample_directories_list_file: list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file] list_of_sample_directories = sorted([d for d in list_of_sample_directories if d]) #========================================================================== # Read in all vcf files and process into dict of SNPs passing various # criteria. Do this for each sample. Write to file. #========================================================================== snp_list_file_path = options_dict['snpListFile'] vcf_file_name = options_dict['vcfFileName'] list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in list_of_sample_directories] bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files) if bad_file_count == len(list_of_vcf_files): utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count) elif bad_file_count > 0: utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True) if options_dict['forceFlag'] or utils.target_needs_rebuild(list_of_vcf_files, snp_list_file_path): snp_dict = utils.convert_vcf_files_to_snp_dict(list_of_vcf_files) verbose_print('Found %d snp positions across %d sample vcf files.' % (len(snp_dict), len(list_of_vcf_files))) utils.write_list_of_snps(snp_list_file_path, snp_dict) verbose_print("") else: verbose_print("SNP list %s has already been freshly built. Use the -f option to force a rebuild." 
                      % snp_list_file_path)

    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
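# Illustrative sketch (not part of the pipeline): the snp list is essentially the union of
# variant positions found across the per-sample VCF files, together with the samples that
# contain each position.  The real utils.convert_vcf_files_to_snp_dict performs more
# validation; this minimal text parser only reads the CHROM and POS columns and derives the
# sample name from the parent directory of each VCF file.
def example_collect_snp_positions(vcf_file_paths):
    """Return a dict mapping (chrom, pos) to the list of sample names having that SNP."""
    snp_positions = {}
    for vcf_path in vcf_file_paths:
        sample_name = os.path.basename(os.path.dirname(vcf_path))
        with open(vcf_path) as vcf_in:
            for line in vcf_in:
                if line.startswith("#"):
                    continue
                fields = line.split("\t")
                if len(fields) < 2:
                    continue
                key = (fields[0], int(fields[1]))
                snp_positions.setdefault(key, []).append(sample_name)
    return snp_positions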
def create_snp_matrix(options_dict): """Create SNP matrix Description: Create the SNP matrix containing the consensus base for each of the samples at the positions where SNPs were found in any of the samples. The matrix contains one row per sample and one column per SNP position. Non-SNP positions are not included in the matrix. This function expects, or creates '(*)', the following files arranged in the following way: sampleDirectories.txt samples sample_name_one/consensus.fasta ... snpma.fasta (*) The files are used as follows: 1. The sampleDirectories.txt input file contains a list of the paths to the sample directories. 2. The consensus.fasta input files are previously called consensus for each sample to construct the SNP matrix fasta file. 3. The snpma.fasta output file contains the SNP calls for each sequence, arranged as a multi-fasta file with one sequence per sample. The sampleDirectories.txt, and consensus.fasta are created outside of this function. The package documentation provides an example of creating these files based on the lambda_virus sequence that is used as one test for this package. Args: sampleDirsFile : str File path (not just file name) of file containing paths to directories containing consensus.fasta file for each sequence. snpListFile : str File path (not just file name) of text format list of SNP positions consFileName : str File name of the previously called consensus fasta files which must exist in each of the sample directories snpmaFile : str File path (not just file name) of the output snp matrix, formatted as a fasta file, with each sequence (all of identical length) corresponding to the SNPs in the correspondingly named sequence. Raises: Examples: options_dict = {'sampleDirsFile':'sampleDirectories.txt', 'consFileName':'consensus.fasta', 'snpmaFile':'snpma.fasta', 'minConsFreq':0.6, } create_snp_matrix(options_dict) """ print_log_header() verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short())) verbose_print("# %s version %s" % (utils.program_name(), __version__)) print_arguments(options_dict) #========================================================================== # Prep work #========================================================================== sample_directories_list_filename = options_dict['sampleDirsFile'] bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_filename]) if bad_file_count > 0: utils.global_error(None) with open(sample_directories_list_filename, "r") as sample_directories_list_file: list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file] list_of_sample_directories = sorted([d for d in list_of_sample_directories if d]) #========================================================================== # Verify input consensus.fasta files exist #========================================================================== consensus_files = [] bad_file_count = 0 for sample_directory in list_of_sample_directories: consensus_file_path = os.path.join(sample_directory, options_dict['consFileName']) bad_count = utils.verify_non_empty_input_files("Consensus fasta file", [consensus_file_path]) if bad_count == 1: bad_file_count += 1 else: consensus_files.append(consensus_file_path) # keep the list of good files if bad_file_count == len(list_of_sample_directories): utils.global_error("Error: all %d consensus fasta files were missing or empty." 
% bad_file_count) elif bad_file_count > 0: utils.sample_error("Error: %d consensus fasta files were missing or empty." % bad_file_count, continue_possible=True) #========================================================================== # Check if the result is already fresh #========================================================================== snpma_file_path = options_dict['snpmaFile'] source_files = consensus_files if not options_dict['forceFlag']: if not utils.target_needs_rebuild(source_files, snpma_file_path): verbose_print("SNP matrix %s has already been freshly built. Use the -f option to force a rebuild." % snpma_file_path) verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name())) return #========================================================================== # Create snp matrix. Write results to file. #========================================================================== with open(snpma_file_path, "w") as output_file: for consensus_file_path in consensus_files: verbose_print("Merging " + consensus_file_path) with open(consensus_file_path, "r") as input_file: for line in input_file: output_file.write(line) verbose_print("") verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
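# Illustrative sketch (not part of the pipeline): create_snp_matrix simply concatenates the
# per-sample consensus fasta files, so the resulting snpma.fasta is a multi-fasta with one
# record per sample, each of identical length.  The file paths below are hypothetical
# placeholders, not files shipped with the package.
def example_run_create_snp_matrix():
    """Hypothetical invocation of create_snp_matrix; the paths are placeholders."""
    example_options = {'sampleDirsFile': 'sampleDirectories.txt',
                       'consFileName': 'consensus.fasta',
                       'snpmaFile': 'snpma.fasta',
                       'forceFlag': False}
    create_snp_matrix(example_options)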
def filter_regions_per_sample(list_of_vcf_files, contig_length_dict, sorted_list_of_outgroup_samples, force_flag, edge_length, window_size_list, max_num_snps_list, ref_fasta_path, out_group_list_path): """Detect abnormal regions in each sample and filter those regions from all samples. Parameters ---------- list_of_vcf_files : list of str List of input VCF file paths -- one per sample. contig_length_dict : dict, str --> int Mapping of contig id to int length of contig. sorted_list_of_outgroup_samples : list of str List of sample IDs for samples that are outgroup samples. force_flag : bool Force processing even when result files already exist and are newer than inputs. edge_length : int The length of the edge regions in a contig, in which all SNPs will be removed. window_size_list : list of int The length of the window in which the number of SNPs should be no more than max_num_snp. max_num_snps_list : list of int The maximum number of SNPs allowed in a window. This list has the same size as window_size_list and the entries correspond to one another. ref_fasta_path : str Path to the reference fasta file. out_group_list_path : str Path to the file indicating outgroup samples, one sample ID per line. """ #========================================================================== # Prep work #========================================================================== input_file_list = list() input_file_list.append(ref_fasta_path) if out_group_list_path: input_file_list.append(out_group_list_path) #========================================================================== # Which samples need rebuild? # # Any changed or new input file will trigger rebuild for all samples because # the bad regions are combined across all samples. However, a missing # output file will only cause rebuild of the missing file. #========================================================================== need_rebuild_dict = dict() for vcf_file_path in list_of_vcf_files: preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" input_files = input_file_list + [vcf_file_path] preserved_needs_rebuild = utils.target_needs_rebuild(input_files, preserved_vcf_file_path) removed_needs_rebuild = utils.target_needs_rebuild(input_files, removed_vcf_file_path) need_rebuild_dict[vcf_file_path] = force_flag or preserved_needs_rebuild or removed_needs_rebuild if not any(need_rebuild_dict.values()): utils.verbose_print("All preserved and removed vcf files are already freshly built. Use the -f option to force a rebuild.") return #========================================================================== # Find all bad regions in one sample at a time #========================================================================== for vcf_file_path in list_of_vcf_files: if not need_rebuild_dict[vcf_file_path]: continue try: vcf_reader_handle = open(vcf_file_path, 'r') vcf_reader = vcf.Reader(vcf_reader_handle) except: utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True) continue sample_ID = utils.sample_id_from_file(vcf_file_path) utils.verbose_print("Processing sample %s" % sample_ID) if sample_ID in sorted_list_of_outgroup_samples: write_outgroup_preserved_and_removed_vcf_files(vcf_file_path, vcf_reader) else: # The bad_regions_dict holds the bad regions for this sample # Key is the contig ID, and the value is a list of bad region tuples (start_position, end_position). 
            bad_regions_dict = dict()
            collect_dense_regions(vcf_reader, bad_regions_dict, contig_length_dict, edge_length, max_num_snps_list, window_size_list)

            # Combine all bad regions for each contig
            for contig, regions in bad_regions_dict.items():
                combined_regions = utils.merge_regions(regions)
                bad_regions_dict[contig] = combined_regions

            # Write the output files
            write_preserved_and_removed_vcf_files(vcf_file_path, bad_regions_dict)

        vcf_reader_handle.close()
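# Illustrative sketch (not part of the pipeline): the per-contig bad regions collected above
# are overlapping (start, end) tuples.  The real utils.merge_regions may differ; this minimal
# version sorts the tuples and merges any regions that overlap or touch.
def example_merge_regions(regions):
    """Merge overlapping or adjacent (start, end) tuples into a sorted, disjoint list."""
    merged = []
    for start, end in sorted(regions):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged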
def calculate_snp_distances(options_dict): """Calculate pairwise sample SNP distances. Description: Calculate pairwise SNP distances from the multi-fasta SNP matrix. Generate a file of pairwise distances and a file containing a matrix of distances. This function expects, or creates '(*)', the following files: snpma.fasta snp_distance_pairwise.tsv* snp_distance_matrix.tsv* The files are used as follows: 1. The snpma.fasta input file contains the snp matrix for all samples 2. The snp_distance_pairwise.tsv output file contains a three column tab-separated table of distances between all pairs of samples 2. The snp_distance_matrix.tsv output file contains a matrix of distances between all samples. Args: inputFile: File path (not just file name) for the snp matrix in fasta format pairwiseFile: File path (not just file name) of the output pairwise distance file matrixFile: File path (not just file name) for the output distance matrix file Raises: Examples: options_dict = {'inputFile':'snpma.fasta', 'pairwiseFile':'snp_distance_pairwise.tsv', 'matrixFile':'snp_distance_matrix.tsv' } calculate_snp_distances(options_dict) """ print_log_header() verbose_print("# %s %s" % (utils.timestamp(), utils.command_line_short())) verbose_print("# %s version %s" % (utils.program_name(), __version__)) print_arguments(options_dict) #========================================================================== # Validate arguments #========================================================================== input_file = options_dict['inputFile'] pairwise_file = options_dict['pairwiseFile'] matrix_file = options_dict['matrixFile'] force_flag = options_dict['forceFlag'] bad_file_count = utils.verify_existing_input_files("SNP matrix file", [input_file]) if bad_file_count > 0: utils.global_error("Error: cannot calculate sequence distances without the snp matrix file.") if not pairwise_file and not matrix_file: utils.global_error("Error: no output file specified.") #========================================================================== # Check freshness #========================================================================== rebuild_pairwise_file = pairwise_file and utils.target_needs_rebuild([input_file], pairwise_file) rebuild_matrix_file = matrix_file and utils.target_needs_rebuild([input_file], matrix_file) if force_flag or rebuild_pairwise_file or rebuild_matrix_file: #------------------------------ # Read in snp matrix file #------------------------------ seqs = {} with open(input_file) as ifile: for line in ifile: line = line.rstrip('\n') if line.startswith('>'): curr_sample = line.lstrip('>') seqs[curr_sample] = '' else: seqs[curr_sample] += str(line) #------------------------------ # Count mismatches #------------------------------ ids = sorted(seqs.keys()) pairwise_mismatches = dict() # tuple (seq1 id, seq2 id) -> int for id1, id2 in itertools.combinations(ids, 2): mismatches = utils.calculate_sequence_distance(seqs[id1], seqs[id2]) pairwise_mismatches[(id1, id2)] = mismatches pairwise_mismatches[(id2, id1)] = mismatches #------------------------------ # Print distance files #------------------------------ if pairwise_file: with open(pairwise_file, 'w') as p_out: p_out.write('%s\n' % '\t'.join(['Seq1', 'Seq2', 'Distance'])) for id1, id2 in itertools.product(ids, ids): mismatches = pairwise_mismatches.get((id1, id2), 0) # zero when id1=id2 p_out.write("%s\t%s\t%i\n" % (id1, id2, mismatches)) if matrix_file: with open(matrix_file, 'w') as m_out: m_out.write('\t%s\n' % '\t'.join(ids)) # matrix header # write 
            # table of mismatches, one row per sample
            for id1 in ids:
                mismatches = [pairwise_mismatches.get((id1, id2), 0) for id2 in ids]
                mismatch_strs = map(str, mismatches)
                m_out.write("%s\t%s\n" % (id1, '\t'.join(mismatch_strs)))
    else:
        verbose_print("Distance files have already been freshly built. Use the -f option to force a rebuild.")

    verbose_print("# %s %s finished" % (utils.timestamp(), utils.program_name()))
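# Illustrative sketch (not part of the pipeline): the pairwise distance used above is a count
# of mismatching positions between two equal-length snp matrix sequences.  The real
# utils.calculate_sequence_distance may treat gaps and ambiguous bases differently; this
# minimal version counts a position only when both bases are unambiguous (A, C, G, T) and
# differ.
def example_sequence_distance(seq1, seq2):
    """Count mismatching positions between two equal-length sequences."""
    unambiguous = set("ACGT")
    return sum(1 for a, b in zip(seq1.upper(), seq2.upper())
               if a in unambiguous and b in unambiguous and a != b)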
def map_reads(args): """Align reads to the reference. Execute an external program (bowtie2 or smalt) to map the fastq reads to a reference file. The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt. This function expects, or creates '(*)', the following files arranged in the following way: reference referenceFile.fasta samples sample_name_one/sampleFastqFile_1.fastq sample_name_one/sampleFastqFile_2.fastq sample_name_one/reads.sam* The reverse fastq file is optional. The fastq files may be either compressed with gzip or uncompressed. The reverse fastq file is optional. All the input files are created outside of this function. The package documentation provides an example of preparing these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : argparse.Namespace referenceFile : File path of the reference fasta file sampleFastqFile1 : File path of the forward fastq file sampleFastqFile2 : Optional file path of the reverse fastq file """ utils.print_log_header() utils.print_arguments(args) #========================================================================== # Validate inputs #========================================================================== # Verify reference fasta file exists and is not empty reference_file_path = args.referenceFile utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global") # Verify fastq files exist and are not empty sample_fastq_file1 = args.sampleFastqFile1 sample_fastq_file2 = args.sampleFastqFile2 fastq_files = [sample_fastq_file1] if sample_fastq_file2: fastq_files.append(sample_fastq_file2) utils.verify_non_empty_input_files("Sample file", fastq_files, error_handler="sample") # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2" snp_pipeline_aligner = snp_pipeline_aligner.lower() if snp_pipeline_aligner not in ["bowtie2", "smalt"]: utils.global_error("Error: only bowtie2 and smalt aligners are supported.") sample_dir = os.path.dirname(sample_fastq_file1) sample_id = utils.sample_id_from_file(sample_fastq_file1) reference_base_path = os.path.splitext(reference_file_path)[0] # strip the file extension reference_id = os.path.basename(reference_base_path) #========================================================================== # Check if alignment to reference has already been done #========================================================================== sam_file = os.path.join(sample_dir, "reads.sam") source_files = [sample_fastq_file1] if sample_fastq_file2: source_files.append(sample_fastq_file2) if snp_pipeline_aligner == "bowtie2": source_files.append(reference_base_path + ".rev.1.bt2") elif snp_pipeline_aligner == "smalt": source_files.append(reference_base_path + ".smi") needs_rebuild = utils.target_needs_rebuild(source_files, sam_file) if not args.forceFlag and not needs_rebuild: verbose_print("# %s has already been aligned to %s. Use the -f option to force a rebuild." 
% (sample_id, reference_id)) return #========================================================================== # Construct the command line to execute bowtie2 or smalt #========================================================================== # The read group identifies reads from a single run and lane read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1, sample_id) # Default to 8 cores on HPC or all cpu cores on workstation if os.environ.get("JOB_ID") or os.environ.get("PBS_JOBID"): num_cores = 8 else: num_cores = psutil.cpu_count() num_cores_param = "" if snp_pipeline_aligner == "bowtie2": version_str = utils.extract_version_str("bowtie2", "bowtie2 --version") # Parse the user-specified bowtie parameters to determine if the user specified the number of CPU cores bowtie2_align_extra_params = os.environ.get("Bowtie2Align_ExtraParams") or "" if not utils.detect_numeric_option_in_parameters_str(bowtie2_align_extra_params, "-p"): num_cores_param = "-p " + str(num_cores) # Specify the read group and sample tags here, --rg tags cannot be specified without ID. # The read group tags are used by some downstream tools, like Picard and GATK. read_group_params = "" if read_group_tags: read_group_params += " --rg-id " + read_group_tags.ID read_group_params += " --rg SM:" + read_group_tags.SM read_group_params += " --rg LB:" + read_group_tags.LB read_group_params += " --rg PL:" + read_group_tags.PL read_group_params += " --rg PU:" + read_group_tags.PU # Substitute the default parameters if the user did not specify bowtie parameters bowtie2_align_params = bowtie2_align_extra_params or "--reorder -q" # Build the command with options depending on whether the fastq files are paired command_line = "bowtie2 " + num_cores_param + " " + read_group_params + " " + bowtie2_align_params + " -x " + reference_base_path if sample_fastq_file2: command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2 else: command_line += " -U " + sample_fastq_file1 elif snp_pipeline_aligner == "smalt": version_str = utils.extract_version_str("smalt", "smalt version") # Parse the user-specified smalt parameters to determine if the user specified the number of CPU cores smalt_align_extra_params = os.environ.get("SmaltAlign_ExtraParams") or "" if not utils.detect_numeric_option_in_parameters_str(smalt_align_extra_params, "-n"): num_cores_param = "-n " + str(num_cores) # Substitute the default parameters if the user did not specify smalt parameters smalt_align_params = smalt_align_extra_params or "-O" # Don't use the -i 1000 option if the fastq file is unpaired if not sample_fastq_file2: smalt_align_params = re.sub("-i[ ]+[0-9]+", '', smalt_align_extra_params) # regex substitute command_line = "smalt map " + num_cores_param + " " + smalt_align_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (sample_fastq_file2 or "") #========================================================================== # Run the command to execute bowtie2 or smalt #========================================================================== verbose_print("# Align sequence %s to reference %s" % (sample_id, reference_id)) verbose_print("# %s %s" % (utils.timestamp(), command_line)) verbose_print("# %s" % version_str) command.run(command_line, sam_file) #========================================================================== # When using smalt, assign read groups in a separate step. # This is already done when using bowtie2. 
#========================================================================== if snp_pipeline_aligner == "smalt" and read_group_tags: smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam") shutil.move(sam_file, smalt_sam_file) version_str = utils.extract_version_str("Picard", "java picard.cmdline.PicardCommandLine AddOrReplaceReadGroups --version 2>&1") jvm_params = os.environ.get("PicardJvm_ExtraParams") or "" command_line = "java " + jvm_params + " picard.cmdline.PicardCommandLine AddOrReplaceReadGroups" command_line += " I=" + smalt_sam_file command_line += " O=" + sam_file command_line += " RGID=" + read_group_tags.ID command_line += " RGSM=" + read_group_tags.SM command_line += " RGLB=" + read_group_tags.LB command_line += " RGPL=" + read_group_tags.PL command_line += " RGPU=" + read_group_tags.PU verbose_print("") verbose_print("# Assign read group id %s" % (read_group_tags.ID)) verbose_print("# %s %s" % (utils.timestamp(), command_line)) verbose_print("# %s" % version_str) command.run(command_line, sys.stdout)
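# Illustrative sketch (not part of the pipeline): map_reads only injects its own CPU-core
# option when the user has not already supplied one in the aligner's extra-parameters
# environment variable.  The real utils.detect_numeric_option_in_parameters_str may differ;
# this minimal check, which relies on the module-level re import, looks for the option flag
# followed by whitespace and digits, e.g. "-p 16".
def example_has_numeric_option(parameters_str, option):
    """Return True if the option appears with a numeric argument in the parameter string."""
    return re.search(re.escape(option) + r"\s+\d+", parameters_str) is not None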
def create_snp_reference_seq(args): """Write reference sequence bases at SNP locations to a fasta file. Write reference sequence bases at SNP locations to a fasta file. This function expects, or creates '(*)', the following files: reference.fasta snplist.txt referenceSNP.fasta (*) The files are used as follows: 1. The reference.fasta input file contains the whole-genome reference bases. 2. The snplist.txt input file contains the list of SNP positions across all the samples. 2. The referenceSNP.fasta output file contains the reference bases at the identified SNP locations. The snplist.txt file is created outside of this function. The package documentation provides an example of creating this file based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : Namespace referenceFile: File path (not just file name) for reference sequence in fasta format snpListFile: File path (not just file name) of text format list of SNP positions snpRefFile: File path (not just file name) for the SNP reference sequence file. Raises: Examples: args = argparse.Namespace args.referenceFile = 'reference.fasta' args.snpListFile = 'snplist.txt' args.snpRefFile = 'referenceSNP.fasta' create_snp_reference_seq(args) """ utils.print_log_header() utils.print_arguments(args) #========================================================================== # Write reference sequence bases at SNP locations to a fasta file. #========================================================================== reference_file = args.referenceFile snp_list_file_path = args.snpListFile snp_ref_seq_path = args.snpRefFile #========================================================================== # Verify input files exist #========================================================================== bad_file_count = utils.verify_existing_input_files("Snplist file", [snp_list_file_path]) if bad_file_count > 0: utils.global_error("Error: cannot create the snp reference sequence without the snplist file.") bad_file_count = utils.verify_non_empty_input_files("Reference file", [reference_file]) if bad_file_count > 0: utils.global_error("Error: cannot create the snp reference sequence without the reference fasta file.") #========================================================================== # Find the reference bases at the snp positions #========================================================================== source_files = [reference_file, snp_list_file_path] if args.forceFlag or utils.target_needs_rebuild(source_files, snp_ref_seq_path): utils.write_reference_snp_file(reference_file, snp_list_file_path, snp_ref_seq_path) else: verbose_print("SNP reference sequence %s has already been freshly built. Use the -f option to force a rebuild." % snp_ref_seq_path)
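# Illustrative sketch (not part of the pipeline): the referenceSNP.fasta file holds the
# reference base at each snplist position.  The real utils.write_reference_snp_file may
# differ; this minimal version assumes the snp list file is tab-delimited with the contig
# name in column 1 and the 1-based position in column 2, and it reuses the Bio.SeqIO import
# already needed elsewhere in this module.
def example_write_reference_snp_fasta(reference_fasta_path, snp_list_path, output_fasta_path):
    """Write one fasta record per contig containing the reference bases at SNP positions."""
    positions_per_contig = {}
    with open(snp_list_path) as snp_list_file:
        for line in snp_list_file:
            fields = line.split("\t")
            if len(fields) >= 2:
                positions_per_contig.setdefault(fields[0], []).append(int(fields[1]))
    with open(output_fasta_path, "w") as out_file:
        for record in SeqIO.parse(reference_fasta_path, "fasta"):
            positions = sorted(positions_per_contig.get(record.id, []))
            bases = "".join(str(record.seq[pos - 1]) for pos in positions if 0 < pos <= len(record.seq))
            out_file.write(">%s\n%s\n" % (record.id, bases))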
def call_sites(args): """Find the sites with SNPs in a sample. The sample alignment is sorted, duplicate reads are removed, a pileup is generated, and snps are called. This function expects, or creates '(*)', the following files arranged in the following way: reference referenceFile.fasta samples sample_name_one/reads.sorted.deduped.indelrealigned.bam sample_name_one/reads.all.pileup* sample_name_one/var.flt.vcf* The input files are created outside of this function. The package documentation provides an example of preparing these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : argparse.Namespace referenceFile : File path of the reference fasta file sampleDir : Relative or absolute directory of the sample """ utils.print_log_header(classpath=True) utils.print_arguments(args) #========================================================================== # Validate inputs #========================================================================== # Verify reference fasta file exists and is not empty reference_file_path = args.referenceFile utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global") sample_dir = args.sampleDir remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true" enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true" input_bam_file = os.path.join(sample_dir, "reads.sorted.bam") input_bam_file = utils.add_file_suffix(input_bam_file, ".deduped", enable=remove_duplicate_reads) input_bam_file = utils.add_file_suffix(input_bam_file, ".indelrealigned", enable=enable_local_realignment) utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file], error_handler="sample") sample_id = utils.sample_id_from_dir(sample_dir) #========================================================================== # Create the pileup file #========================================================================== # Check for fresh pileup; if not, create it pileup_file = os.path.join(sample_dir, "reads.all.pileup") needs_rebuild = utils.target_needs_rebuild( [input_bam_file, reference_file_path], pileup_file) if not args.forceFlag and not needs_rebuild: verbose_print( "# Pileup file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id) else: version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null") samtools_mpileup_extra_params = os.environ.get( "SamtoolsMpileup_ExtraParams") or "" command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + input_bam_file verbose_print("# Create pileup from bam file.") verbose_print("# %s %s" % (utils.timestamp(), command_line)) verbose_print("# %s" % version_str) command.run(command_line, pileup_file) utils.sample_error_on_missing_file(pileup_file, "samtools mpileup") verbose_print("") #========================================================================== # Find the sites with SNPs #========================================================================== # Check for fresh unfiltered vcf; if not, create it vcf_file = os.path.join(sample_dir, "var.flt.vcf") needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file) if not args.forceFlag and not needs_rebuild: verbose_print( "# VCF file is already freshly created for %s. Use the -f option to force a rebuild." 
% sample_id) else: jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH") if not jar_file_path: utils.global_error( "Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable." ) else: version_str = utils.extract_version_str( "VarScan", "java -jar " + jar_file_path + " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2") varscan_jvm_extra_params = os.environ.get( "VarscanJvm_ExtraParams") or "" varscan_mpileup2snp_extra_params = os.environ.get( "VarscanMpileup2snp_ExtraParams") or "" command_line = "java " + varscan_jvm_extra_params + " -jar " + jar_file_path + " mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params verbose_print("# Create vcf file") verbose_print("# %s %s" % (utils.timestamp(), command_line)) verbose_print("# %s" % version_str) command.run(command_line, vcf_file) utils.sample_error_on_missing_file(vcf_file, "VarScan") utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan") utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
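# Illustrative sketch (not part of the pipeline): after running VarScan, call_sites scans the
# output for known failure markers such as "OutOfMemoryError".  The real
# utils.sample_error_on_file_contains may differ; this minimal check simply reports whether a
# text file contains the given marker string.
def example_file_contains(file_path, text):
    """Return True if the file exists and contains the given text."""
    if not os.path.isfile(file_path):
        return False
    with open(file_path) as f:
        return any(text in line for line in f)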
def filter_regions(args): """Remove bad SNPs from original vcf files Remove bad SNPs -- this function finds bad regions, including the edges and probable prophage regions; then remove SNPs in these regions in original vcf files of all samples. This function expects, or creates '(*)', the following files arranged in the following way: sampleDirectories.txt samples sample_name_one/var.flt.vcf sample_name_one/var.flt_removed.vcf (*) sample_name_one/var.flt_preserved.vcf (*) ... The files are used as follows: 1. The sampleDirectories.txt input file contains a list of the paths to the sample directories. 2. The var.flt.vcf variant input files (i.e., the original vcf file). 3. The var.flt_removed.vcf and var.flt_preserved.vcf output files contain the removed SNPs and preserved SNPs. The sampleDirectories.txt and var.flt.vcf files are created outside of this function. The package documentation provides an example of creating these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- Args: sampleDirsFile: File path (not just file name) of file containing paths to directories containing var.flt.vcf file for each sequence. vcfFileName: File name of the VCF files which must exist in each of the sample directories refFastaFile: File path (not just file name) of reference fasta file edgeLength: the length of edge of a contig in which SNPs will be removed. Default is 500. windowSize: the size of the window in which max number of SNPs are allowed. Default is 1000. maxSNP: the maximum number of SNPs allowed in a window of a size defined in windowSize. Default is 3. Raises: Examples: args = argparse.Namespace args.sampleDirsFile = 'sampleDirectories.txt' args.vcfFileName = 'var.flt.vcf' args.refFastaFile = 'snplist.txt' remove_bad_snp(args) """ utils.print_log_header() utils.print_arguments(args) #========================================================================== # Validate some parameters #========================================================================== edge_length = args.edgeLength window_size = args.windowSize max_num_snp = args.maxSNP #========================================================================== # Prep work #========================================================================== sample_directories_list_path = args.sampleDirsFile bad_file_count = utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path]) if bad_file_count > 0: utils.global_error(None) with open(sample_directories_list_path, "r") as sample_directories_list_file: unsorted_list_of_sample_directories = [line.rstrip() for line in sample_directories_list_file] unsorted_list_of_sample_directories = [d for d in unsorted_list_of_sample_directories if d] sorted_list_of_sample_directories = sorted(unsorted_list_of_sample_directories) input_file_list = list() out_group_list_path = args.outGroupFile sorted_list_of_outgroup_samples = list() if out_group_list_path is not None: bad_file_count = utils.verify_non_empty_input_files("File of outgroup samples", [out_group_list_path]) if bad_file_count > 0: utils.global_error(None) try: #There are outgroup samples input_file_list.append(out_group_list_path) with open(out_group_list_path, "r") as out_group_list_file: unsorted_list_of_outgroup_samples = [line.rstrip() for line in out_group_list_file] sorted_list_of_outgroup_samples = sorted(unsorted_list_of_outgroup_samples) except: utils.global_error("Error: Cannot open the file containing the list of outgroup samples!") 
#========================================================================== # Validate inputs #========================================================================== vcf_file_name = args.vcfFileName list_of_vcf_files = [os.path.join(dir, vcf_file_name) for dir in sorted_list_of_sample_directories] input_file_list.extend(list_of_vcf_files) bad_file_count = utils.verify_non_empty_input_files("VCF file", list_of_vcf_files) if bad_file_count == len(list_of_vcf_files): utils.global_error("Error: all %d VCF files were missing or empty." % bad_file_count) elif bad_file_count > 0: utils.sample_error("Error: %d VCF files were missing or empty." % bad_file_count, continue_possible=True) bad_file_count = utils.verify_non_empty_input_files("Reference file", [args.refFastaFile]) if bad_file_count > 0: utils.global_error(None) #========================================================================== # Get contigs' length from the reference fasta file #========================================================================== try: handle = open(args.refFastaFile, "r") contig_length_dict = dict() for record in SeqIO.parse(handle, "fasta"): #build contig_length_dict contig_length_dict[record.id] = len(record.seq) input_file_list.append(args.refFastaFile) except: utils.global_error("Error: cannot open the reference fastq file, or fail to read the contigs in the reference fastq file.") else: if handle: handle.close() #========================================================================== # Which samples need rebuild? # # Any changed or new input file will trigger rebuild for all samples because # the bad regions are combined across all samples. However, a missing # output file will only cause rebuild of the missing file. #========================================================================== need_rebuild_dict = dict() for vcf_file_path in list_of_vcf_files: preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" preserved_needs_rebuild = utils.target_needs_rebuild(input_file_list, preserved_vcf_file_path) removed_needs_rebuild = utils.target_needs_rebuild(input_file_list, removed_vcf_file_path) need_rebuild_dict[vcf_file_path] = args.forceFlag or preserved_needs_rebuild or removed_needs_rebuild if not any(need_rebuild_dict.values()): utils.verbose_print("All preserved and removed vcf files are already freshly built. Use the -f option to force a rebuild.") return #========================================================================== # Find all bad regions. #========================================================================== bad_regions_dict = dict() # Key is the contig ID, and the value is a list of bad regions. for vcf_file_path in list_of_vcf_files: try: vcf_reader_handle = open(vcf_file_path, 'r') vcf_reader = vcf.Reader(vcf_reader_handle) except: utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True) continue #Get sample ID ss = vcf_file_path.split('/') sample_ID = ss[-2] if sample_ID in sorted_list_of_outgroup_samples: if not need_rebuild_dict[vcf_file_path]: vcf_reader_handle.close() continue #Copy original vcf file to _preserved.vcf, and created an empty _removed.vcf #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF. 
preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" try: vcf_writer_removed = None vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader) except: #print "Cannot create the file for removed SNPs: %d." % removed_vcf_file_path #close vcf_writer_reserved and remove the file reserved_vcf_file_path if vcf_writer_removed is not None: vcf_writer_removed.close() os.remove(removed_vcf_file_path) vcf_reader_handle.close() utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True) continue vcf_writer_removed.close() vcf_reader_handle.close() shutil.copyfile(vcf_file_path, preserved_vcf_file_path) else: #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF. snp_dict = defaultdict(list) for vcf_data_line in vcf_reader: #Create a dict to store all SNPs in this sample #get contig length from contig name.The CHROM should be a contig name in the format of Velvet/SPAdes output. record = (vcf_data_line.POS, vcf_data_line) snp_dict[vcf_data_line.CHROM].append(record) #Find bad regions and add them into bad_region for contig, snp_list in snp_dict.items(): #sort all SNPs in this contig by position sorted_list = sorted(snp_list, key=lambda SNPs: SNPs[0]) #total number of SNPs num_of_snp = len(sorted_list) if contig not in bad_regions_dict: #New contig try: contig_length = contig_length_dict[contig] except: #cannot find contig length. Use the sys.maxsize. contig_length = sys.maxsize if (contig_length <= (edge_length * 2)): bad_regions_dict[contig] = [(0, contig_length)] else: region = [(0, edge_length), (contig_length - edge_length, contig_length)] bad_regions_dict[contig] = region #Process SNPs for idx, snp in enumerate(sorted_list): if (idx + max_num_snp) < num_of_snp: pos_start = snp[0] pos_end = sorted_list[idx + max_num_snp][0] if (pos_start + window_size) >= pos_end: #Add bad region regions = bad_regions_dict[contig] temp_region = (pos_start, pos_end) regions.append(temp_region) vcf_reader_handle.close() #Combine all bad regions for each contig for contig, regions in bad_regions_dict.items(): sorted_regions = utils.sort_coord(regions) combined_regions = utils.consensus(sorted_regions) bad_regions_dict[contig] = combined_regions #Scan vcf files to remove SNPs for vcf_file_path in list_of_vcf_files: if not need_rebuild_dict[vcf_file_path]: continue #Get sample ID ss = vcf_file_path.split('/') sample_ID = ss[-2] if sample_ID not in sorted_list_of_outgroup_samples: try: vcf_reader_handle = open(vcf_file_path, 'r') vcf_reader = vcf.Reader(vcf_reader_handle) except: utils.sample_error("Error: Cannot open the input vcf file: %s." % vcf_file_path, continue_possible=True) continue #SNP list, saved as (Contig_Name, [(SNP_Position, SNP_Record),]), where SNP_Record is a line in VCF. preserved_vcf_file_path = vcf_file_path[:-4] + "_preserved.vcf" removed_vcf_file_path = vcf_file_path[:-4] + "_removed.vcf" try: vcf_writer_preserved = None vcf_writer_preserved = vcf.Writer(open(preserved_vcf_file_path, 'w'), vcf_reader) except: if vcf_writer_preserved is not None: vcf_writer_preserved.close() os.remove(preserved_vcf_file_path) vcf_reader_handle.close() utils.sample_error("Error: Cannot create the file for preserved SNPs: %s." 
% preserved_vcf_file_path, continue_possible=True) continue try: vcf_writer_removed = None vcf_writer_removed = vcf.Writer(open(removed_vcf_file_path, 'w'), vcf_reader) except: #close vcf_writer_reserved and remove the file reserved_vcf_file_path if vcf_writer_removed is not None: vcf_writer_removed.close() os.remove(removed_vcf_file_path) vcf_writer_preserved.close() vcf_reader_handle.close() utils.sample_error("Error: Cannot create the file for removed SNPs: %s." % removed_vcf_file_path, continue_possible=True) continue for vcf_data_line in vcf_reader: #Create a dict to store all SNPs in this sample #get contig length from contig name.The CHROM should be a contig name in the format of Velvet/SPAdes output. contig = vcf_data_line.CHROM if utils.in_region(vcf_data_line.POS, bad_regions_dict[contig]): #Remove this SNP vcf_writer_removed.write_record(vcf_data_line) else: #Preserve this SNP vcf_writer_preserved.write_record(vcf_data_line) vcf_writer_preserved.close() vcf_writer_removed.close() vcf_reader_handle.close()
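# Illustrative sketch (not part of the pipeline): the preserved/removed split above depends on
# whether a SNP position falls inside any bad region for its contig.  The real utils.in_region
# may differ; this minimal version treats each region as an inclusive (start, end) tuple.
def example_in_region(position, regions):
    """Return True if the position falls inside any (start, end) region."""
    return any(start <= position <= end for start, end in regions)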
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file.  The sample alignment is sorted, duplicate reads
    are marked, and reads are realigned around indels.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.sorted.deduped.bai*
                sample_name_one/realign.target.intervals*
                sample_name_one/reads.sorted.deduped.indelrealigned.bam*

    The fastq files may be either compressed with gzip or uncompressed.

    The reverse fastq file is optional.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)
    utils.verify_non_empty_input_files("Sample file", fastq_files, error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error("Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[0]  # strip the file extension
    reference_id = os.path.basename(reference_base_path)
    num_threads = args.threads

    #==========================================================================
    # Verify jar files are in CLASSPATH
    #==========================================================================
    picard_jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
    if not picard_jar_file_path:
        utils.global_error("Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable.")
    picard_version_str = utils.extract_version_str("Picard", "java -jar " + picard_jar_file_path + " AddOrReplaceReadGroups --version 2>&1")

    gatk_jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK", "CLASSPATH")
    if not gatk_jar_file_path:
        utils.global_error("Error: cannot execute GATK. Define the path to GenomeAnalysisTK.jar in the CLASSPATH environment variable.")
    gatk_version_str = utils.extract_version_str("GATK", "java -jar " + gatk_jar_file_path + " --version 2>&1")

    #==========================================================================
    # Enforce the proper SAMtools version
    #==========================================================================
    samtools_version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
    samtools_version = samtools_version_str.split()[-1]  # just the number
    # Compare numeric version components; a plain string comparison would wrongly
    # reject versions such as 1.10.
    samtools_version_tuple = tuple(int(n) for n in re.findall(r"\d+", samtools_version)[:2])
    if samtools_version_tuple < (1, 4):
        utils.global_error("The installed %s is not supported. Version 1.4 or higher is required." % samtools_version_str)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    source_files = [sample_fastq_file1]
    if sample_fastq_file2:
        source_files.append(sample_fastq_file2)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print("# %s has already been aligned to %s. Use the -f option to force a rebuild." % (sample_id, reference_id))
    else:
        #==========================================================================
        # Construct the command line to execute bowtie2 or smalt
        #==========================================================================

        # The read group identifies reads from a single run and lane
        read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1, sample_id)

        # Make up dummy read group tags if the read group information is missing from the fastq files.
        # GATK components require these tags.
        if read_group_tags is None:
            id = "1"
            sm = sample_id
            lb = "1"
            pl = None
            pu = sample_id
            read_group_tags = fastq.ReadGroupTags(id, sm, lb, pl, pu)

        if snp_pipeline_aligner == "bowtie2":
            version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")

            # Substitute the default parameters if the user did not specify bowtie parameters
            os.environ["Bowtie2Align_ExtraParams"] = os.environ.get("Bowtie2Align_ExtraParams") or "--reorder"

            # Set the number of threads to use
            utils.configure_process_threads("Bowtie2Align_ExtraParams", "-p", num_threads, None)
            bowtie2_align_extra_params = os.environ["Bowtie2Align_ExtraParams"]

            # Specify the read group and sample tags here; --rg tags cannot be specified without ID.
            # The read group tags are used by some downstream tools, like Picard and GATK.
            read_group_params = ""
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            if read_group_tags.PL is not None:
                read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

            # Build the command with options depending on whether the fastq files are paired
            command_line = "bowtie2 " + read_group_params + " " + bowtie2_align_extra_params + " -x " + reference_base_path
            if sample_fastq_file2:
                command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
            else:
                command_line += " -U " + sample_fastq_file1

        elif snp_pipeline_aligner == "smalt":
            version_str = utils.extract_version_str("smalt", "smalt version")

            # Substitute the default parameters if the user did not specify smalt parameters
            os.environ["SmaltAlign_ExtraParams"] = os.environ.get("SmaltAlign_ExtraParams") or "-O"

            # Set the number of threads to use
            utils.configure_process_threads("SmaltAlign_ExtraParams", "-n", num_threads, None)
            smalt_align_extra_params = os.environ["SmaltAlign_ExtraParams"]

            # Don't use the -i 1000 insert-size option if the fastq file is unpaired;
            # strip it from the parameters actually passed to smalt.
            if not sample_fastq_file2:
                smalt_align_extra_params = re.sub("-i[ ]+[0-9]+", '', smalt_align_extra_params)  # regex substitute

            command_line = "smalt map " + smalt_align_extra_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (sample_fastq_file2 or "")

        #==========================================================================
        # Run the command to execute bowtie2 or smalt
        #==========================================================================
        verbose_print("# Align sequence %s to reference %s" % (sample_id, reference_id))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sam_file)

        #==========================================================================
        # When using smalt, assign read groups in a separate step.
        # This is already done when using bowtie2.
        #==========================================================================
        if snp_pipeline_aligner == "smalt" and read_group_tags:
            smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
            shutil.move(sam_file, smalt_sam_file)
            jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
            command_line = "java " + jvm_params + " -jar " + picard_jar_file_path + " AddOrReplaceReadGroups"
            command_line += " I=" + smalt_sam_file
            command_line += " O=" + sam_file
            command_line += " RGID=" + read_group_tags.ID
            command_line += " RGSM=" + read_group_tags.SM
            command_line += " RGLB=" + read_group_tags.LB
            if read_group_tags.PL is None:
                command_line += " RGPL=unknown"  # Picard requires this command line option
            else:
                command_line += " RGPL=" + read_group_tags.PL
            command_line += " RGPU=" + read_group_tags.PU
            verbose_print("")
            verbose_print("# Assign read group id %s" % (read_group_tags.ID))
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
            verbose_print("")

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        # Substitute the default parameters if the user did not specify samtools view parameters
        os.environ["SamtoolsSamFilter_ExtraParams"] = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"

        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSamFilter_ExtraParams", ["-@", "--threads"], num_threads, None)
        samtools_samfilter_params = os.environ["SamtoolsSamFilter_ExtraParams"]

        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSort_ExtraParams", ["-@", "--threads"], num_threads, None)
        samtools_sort_extra_params = os.environ["SamtoolsSort_ExtraParams"]

        command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file
        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
    input_file = sorted_bam_file
    output_file = utils.add_file_suffix(input_file, ".deduped", enable=remove_duplicate_reads)
    if remove_duplicate_reads:
        # Check for fresh deduped bam file; if not, remove duplicate reads
        needs_rebuild = utils.target_needs_rebuild([input_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
        else:
            picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
            picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
            command_line = "java " + picard_jvm_extra_params + " -jar " + picard_jar_file_path + " MarkDuplicates INPUT=" + input_file + " OUTPUT=" + output_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
            verbose_print("# Mark duplicate reads in bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file, "picard MarkDuplicates")
            verbose_print("")

    #==========================================================================
    # Next three steps are part of local realignment around indels
    #==========================================================================
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    #==========================================================================
    # Index the sorted bam file prior to RealignerTargetCreator
    #==========================================================================
    input_file = output_file  # output from last step becomes input to this step
    if enable_local_realignment:
        # Check for fresh bai file; if not, index it
        bam_index_file = input_file[:-3] + "bai"
        needs_rebuild = utils.target_needs_rebuild([input_file], bam_index_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Bam file index is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
        else:
            # Set the number of threads to use
            utils.configure_process_threads("SamtoolsIndex_ExtraParams", "-@", num_threads, None)
            samtools_index_extra_params = os.environ["SamtoolsIndex_ExtraParams"]

            command_line = "samtools index " + samtools_index_extra_params + ' ' + input_file + ' ' + bam_index_file
            verbose_print("# Index bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % samtools_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(bam_index_file, "samtools index")
            verbose_print("")

    #==========================================================================
    # Identify targets for realignment
    #==========================================================================
    if enable_local_realignment:
        # Check for fresh realign_targets_file file; if not, run RealignerTargetCreator
        realign_targets_file = os.path.join(sample_dir, "realign.target.intervals")
        needs_rebuild = utils.target_needs_rebuild([input_file, bam_index_file], realign_targets_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Realign targets file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            # Set the number of threads to use
            utils.configure_process_threads("RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"], num_threads, None)
            realigner_target_creator_extra_params = os.environ["RealignerTargetCreator_ExtraParams"]

            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T RealignerTargetCreator -R " + reference_file_path + " -I " + input_file + " -o " + realign_targets_file + ' ' + realigner_target_creator_extra_params
            verbose_print("# Identify targets for realignment.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(realign_targets_file, "GATK RealignerTargetCreator", empty_ok=True)
            verbose_print("")

    #==========================================================================
    # Realign around indels
    #==========================================================================
    output_file = utils.add_file_suffix(input_file, ".indelrealigned", enable=enable_local_realignment)
    if enable_local_realignment:
        # Check for fresh indelrealigned bam file; if not, run IndelRealigner
        needs_rebuild = utils.target_needs_rebuild([input_file, bam_index_file, realign_targets_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Indelrealigned bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
        else:
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            indel_realigner_extra_params = os.environ.get("IndelRealigner_ExtraParams") or ""
            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T IndelRealigner -R " + reference_file_path + " -targetIntervals " + realign_targets_file + " -I " + input_file + " -o " + output_file + ' ' + indel_realigner_extra_params
            verbose_print("# Realign around indels")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file, "GATK IndelRealigner")
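

#==============================================================================
# Illustrative sketch (not executed by the pipeline): driving map_reads()
# directly with a hand-built argparse.Namespace.  The attribute names are the
# ones the function body reads; the file paths are placeholders taken from the
# directory layout shown in the docstring.
#==============================================================================
def _example_call_map_reads():
    """Hypothetical driver showing the arguments map_reads() expects."""
    import argparse

    args = argparse.Namespace(
        referenceFile="reference/referenceFile.fasta",
        sampleFastqFile1="samples/sample_name_one/sampleFastqFile_1.fastq",
        sampleFastqFile2="samples/sample_name_one/sampleFastqFile_2.fastq",  # optional; may be None
        forceFlag=False,  # set True to force a rebuild of existing outputs
        threads=8,        # thread count passed to the aligner, samtools, and GATK
    )
    os.environ["SnpPipeline_Aligner"] = "bowtie2"  # or "smalt"
    map_reads(args)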
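

#==============================================================================
# Illustrative sketch (assumption, not the pipeline's implementation): the
# make-style freshness test that utils.target_needs_rebuild() is expected to
# perform before each step above decides whether to re-run its command.
#==============================================================================
def _example_target_needs_rebuild(source_files, target_file):
    """Hypothetical stand-in: True if the target is missing or older than any existing source."""
    if not os.path.isfile(target_file):
        return True
    target_mtime = os.path.getmtime(target_file)
    return any(os.path.getmtime(src) > target_mtime
               for src in source_files if os.path.isfile(src))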