def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is
    generated, and snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get("SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher
        samtools_version = version_str.split()[-1]  # just the number
        if samtools_version < "1.3":
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file], deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error("Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable.")
            else:
                version_str = utils.extract_version_str("Picard", "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1")
                picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file, "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
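
# Illustrative usage sketch (not part of the pipeline): call_sites() only reads the
# referenceFile, sampleDir, and forceFlag attributes, so a minimal driver can build the
# Namespace by hand.  The paths below are hypothetical and assume the sample directory
# layout described in the docstring has already been prepared.
def _example_call_sites_usage():
    """Illustrative only: invoke call_sites() on one hypothetical sample directory."""
    import argparse
    example_args = argparse.Namespace(
        referenceFile="reference/referenceFile.fasta",  # hypothetical reference path
        sampleDir="samples/sample_name_one",            # hypothetical sample directory
        forceFlag=False)                                # True forces all outputs to be rebuilt
    call_sites(example_args)
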
def call_sites(args):
    """Find the sites with SNPs in a sample.

    A pileup is generated from the prepared sample alignment and snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    input_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    input_bam_file = utils.add_file_suffix(input_bam_file, ".deduped", enable=remove_duplicate_reads)
    input_bam_file = utils.add_file_suffix(input_bam_file, ".indelrealigned", enable=enable_local_realignment)
    utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([input_bam_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + input_bam_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s. Use the -f option to force a rebuild." % sample_id)
    else:
        jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
        if not jar_file_path:
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java -jar " + jar_file_path + " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " -jar " + jar_file_path + " mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
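
# Note on utils.add_file_suffix() used above: the calls assume it inserts the suffix just
# before the file extension when enable is True and returns the path unchanged otherwise,
# e.g. "reads.sorted.bam" -> "reads.sorted.deduped.bam".  A minimal sketch of that assumed
# behavior (not the actual utils implementation):
def _add_file_suffix_sketch(file_path, suffix, enable=True):
    """Illustrative only: return file_path with suffix inserted before the extension."""
    if not enable:
        return file_path
    root, extension = os.path.splitext(file_path)
    return root + suffix + extension
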
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file. Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum([os.path.getsize(file) for file in fastq_files])

    # Make a comma separated list of just the fastq file names without directories
    fastq_file_list = [os.path.basename(file) for file in fastq_files]
    fastq_file_list = ", ".join(fastq_file_list)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads and %mapped from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    file = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads", "")  # reuse already fresh metrics
            percent_reads_mapped = metrics.get("percentReadsMapped", "")  # reuse already fresh metrics
        if num_reads and percent_reads_mapped:
            verbose_print("Reusing previously calculated number of reads and %mapped")
        else:
            num_reads = command.run("samtools view -S -c " + file)
            num_reads = num_reads.strip()
            mapped = command.run("samtools view -S -c -F 4 " + file)
            mapped = mapped.strip()
            try:
                percent_reads_mapped = 100.0 * float(mapped) / float(num_reads)
                percent_reads_mapped = "%.2f" % percent_reads_mapped
            except ValueError:
                handle_error("Cannot calculate number of reads and %mapped.")

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", file):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads", "")  # reuse already fresh metrics
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + file)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean insert size from bam file"))
    #-------------------------
    ave_insert_size = ""
    file = os.path.join(sample_dir, "reads.sorted.bam")
    if verify_input_file("BAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_insert_size = metrics.get("aveInsertSize", "")  # reuse already fresh metrics
        if ave_insert_size:
            verbose_print("Reusing previously calculated mean insert size")
        else:
            # Extract inferred insert sizes (TLEN, column 9 of BAM file) for reads "mapped in proper pair" (2) and "first in pair" (64) = 66
            tempfile = NamedTemporaryFile(delete=False, dir=sample_dir, prefix="tmp.inserts.", mode='w')
            command.run("samtools view -f 66 " + file + " | cut -f 9 | sed 's/^-//'", tempfile.name)
            insert_count = 0
            insert_sum = 0
            with open(tempfile.name) as f:
                for line in f:
                    try:
                        insert_sum += int(line)
                        insert_count += 1
                    except ValueError:
                        pass
            os.unlink(tempfile.name)
            if insert_count > 0 and insert_sum > 0:
                ave_insert_size = float(insert_sum) / float(insert_count)
                ave_insert_size = "%.2f" % ave_insert_size
            else:
                handle_error("Cannot calculate mean insert size.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    file = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth", "")  # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(file) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                #print("depth_sum=%i" % depth_sum)
                #print("reference_length=%i" % reference_length)
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    file = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps", "")  # reuse already fresh metrics
        if phase1_snps:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            phase1_snps = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps > max_allowed_snps:
                excluded_sample = "Excluded"
                handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
            phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    file = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "")  # reuse already fresh metrics
        if phase1_snps_preserved:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            phase1_snps_preserved = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps:
                excluded_sample_preserved = "Excluded"
                handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
            phase1_snps_preserved = str(phase1_snps_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    file = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps", "")  # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(file)
                phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get("snpsPreserved", "")  # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print("Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(file)
                phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    file = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos", "")  # reuse already fresh metrics
            if missing_pos:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get("missingPosPreserved", "")  # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
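
# The metrics file written above is a flat key=value properties file that
# utils.read_properties() loads on the next run so already-fresh metrics can be reused.
# A minimal sketch of such a reader (an assumption about the helper, shown here only to
# document the file format; the real utils implementation may differ):
def _read_properties_sketch(file_path):
    """Illustrative only: parse key=value lines into a dict, stripping surrounding quotes."""
    properties = dict()
    with open(file_path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            properties[key.strip()] = value.strip().strip('"')
    return properties
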
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file. Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum([os.path.getsize(file) for file in fastq_files])

    # Make a comma separated list of just the fastq file names without directories
    fastq_file_list = [os.path.basename(file) for file in fastq_files]
    fastq_file_list = ", ".join(fastq_file_list)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads, %mapped, %proper pair, and ave insert size from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    percent_proper_pair = ""
    ave_insert_size = ""
    file = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads", "")  # reuse already fresh metrics
            percent_reads_mapped = metrics.get("percentReadsMapped", "")  # reuse already fresh metrics
            percent_proper_pair = metrics.get("percentProperPair", "")  # reuse already fresh metrics
            ave_insert_size = metrics.get("aveInsertSize", "")  # reuse already fresh metrics
        missing_any_metrics = not all([num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size])
        if not missing_any_metrics:
            verbose_print("Reusing previously calculated number of reads, %mapped, %proper pair, and ave insert size")
        else:
            tempfile_path = os.path.join(sample_dir, "tmp.sam.stats")
            try:
                command.run("samtools stats " + file, tempfile_path)
            except subprocess.CalledProcessError:
                pass  # the error message has already been printed to stderr
            with open(tempfile_path) as f:
                for line in f:
                    lower_line = line.lower()
                    split_line = line.strip().split('\t')
                    if "raw total sequences:" in lower_line:
                        num_reads = split_line[2]
                        continue
                    if "reads mapped:" in lower_line:
                        reads_mapped = split_line[2]
                        try:
                            percent_reads_mapped = 100.0 * float(reads_mapped) / float(num_reads)
                            percent_reads_mapped = "%.2f" % percent_reads_mapped
                        except ValueError:
                            percent_reads_mapped = ""
                        continue
                    if "reads properly paired:" in lower_line:
                        proper_pairs = split_line[2]
                        try:
                            percent_proper_pair = 100.0 * float(proper_pairs) / float(num_reads)
                            percent_proper_pair = "%.2f" % percent_proper_pair
                        except ValueError:
                            percent_proper_pair = ""
                        continue
                    if "insert size average:" in lower_line:
                        ave_insert_size = split_line[2]
                        continue
            os.unlink(tempfile_path)
            missing_any_metrics = not all([num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size])
            if missing_any_metrics:
                missing_list = []
                if not num_reads:
                    missing_list.append("number of reads")
                if not percent_reads_mapped:
                    missing_list.append("percent reads mapped")
                if not percent_proper_pair:
                    missing_list.append("percent proper pair")
                if not ave_insert_size:
                    missing_list.append("ave insert size")
                error_text = "Cannot calculate " + ", ".join(missing_list) + '.'
                handle_error(error_text)

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", file):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads", "")  # reuse already fresh metrics
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + file)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    file = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth", "")  # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(file) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                #print("depth_sum=%i" % depth_sum)
                #print("reference_length=%i" % reference_length)
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    file = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps", "")  # reuse already fresh metrics
        if phase1_snps:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            phase1_snps = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps > max_allowed_snps:
                excluded_sample = "Excluded"
                handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
            phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    file = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "")  # reuse already fresh metrics
        if phase1_snps_preserved:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            phase1_snps_preserved = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps:
                excluded_sample_preserved = "Excluded"
                handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
            phase1_snps_preserved = str(phase1_snps_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    file = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps", "")  # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(file)
                phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get("snpsPreserved", "")  # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print("Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(file)
                phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    file = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos", "")  # reuse already fresh metrics
            if missing_pos:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get("missingPosPreserved", "")  # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("percentProperPair=" + percent_proper_pair, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
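
# The block above derives several metrics from a single "samtools stats" run.  The
# summary-number lines it parses are tab-separated with the value in the third field,
# e.g. "SN<TAB>raw total sequences:<TAB>1000", which is why split_line[2] is read.
# A standalone sketch of the same parsing idea (illustrative helper, not used by the pipeline):
def _parse_samtools_stats_sketch(stats_file_path):
    """Illustrative only: map 'samtools stats' summary descriptions to their values."""
    summary = dict()
    with open(stats_file_path) as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) >= 3 and fields[0] == "SN":
                summary[fields[1].rstrip(":").lower()] = fields[2]
    return summary
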