Example #1
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is
    generated, and SNPs are called.

    This function expects, or creates (files marked with '*'), the following
    files arranged as follows:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file],
                                       error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get(
            "SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print(
            "# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file],
                                               sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get(
            "SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher
        samtools_version = version_str.split()[-1]  # just the number
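        # Caution: lexicographic string comparison, so "1.10" compares as less than "1.3"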
        if samtools_version < "1.3":
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(
                sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file],
                                                   deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error(
                    "Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable."
                )
            else:
                version_str = utils.extract_version_str(
                    "Picard",
                    "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1"
                )
                picard_jvm_extra_params = os.environ.get(
                    "PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get(
                    "PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(
                    sample_dir, "duplicate_reads_metrics.txt"
                ) + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file,
                                                   "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild(
        [pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get(
            "SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# VCF file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error(
                "Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable."
            )
        else:
            version_str = utils.extract_version_str(
                "VarScan",
                "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2"
            )
            varscan_jvm_extra_params = os.environ.get(
                "VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get(
                "VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError",
                                                "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient",
                                                "VarScan")
Example #2
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is
    generated, and SNPs are called.

    This function expects, or creates (files marked with '*'), the following
    files arranged as follows:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    sample_dir = args.sampleDir

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads",
                                            "true").lower() == "true"
    enable_local_realignment = os.environ.get("EnableLocalRealignment",
                                              "true").lower() == "true"

    input_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    input_bam_file = utils.add_file_suffix(input_bam_file,
                                           ".deduped",
                                           enable=remove_duplicate_reads)
    input_bam_file = utils.add_file_suffix(input_bam_file,
                                           ".indelrealigned",
                                           enable=enable_local_realignment)

    utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file],
                                       error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild(
        [input_bam_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get(
            "SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + input_bam_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# VCF file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
        if not jar_file_path:
            utils.global_error(
                "Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable."
            )
        else:
            version_str = utils.extract_version_str(
                "VarScan", "java -jar " + jar_file_path +
                " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get(
                "VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get(
                "VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " -jar " + jar_file_path + " mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError",
                                                "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient",
                                                "VarScan")
Example #3
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates (files marked with '*'), the following
    files arranged as follows:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file.  Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)] # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum([os.path.getsize(file) for file in fastq_files])

    # Make a comma separated list of just the fastq file names without directories
    fastq_file_list = [os.path.basename(file) for file in fastq_files]
    fastq_file_list = ", ".join(fastq_file_list)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads and %mapped from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    file = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads", "") # reuse already fresh metrics
            percent_reads_mapped = metrics.get("percentReadsMapped", "") # reuse already fresh metrics
        if num_reads and percent_reads_mapped:
            verbose_print("Reusing previously calculated number of reads and %mapped")
        else:
            num_reads = command.run("samtools view -S -c " + file)
            num_reads = num_reads.strip()
            mapped = command.run("samtools view -S -c -F 4 " + file)
            mapped = mapped.strip()
            try:
                percent_reads_mapped = 100.0 * float(mapped) / float(num_reads)
                percent_reads_mapped = "%.2f" % percent_reads_mapped
            except ValueError:
                handle_error("Cannot calculate number of reads and %mapped.")

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", file):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads", "") # reuse already fresh metrics
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + file)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean insert size from bam file"))
    #-------------------------
    ave_insert_size = ""
    file = os.path.join(sample_dir, "reads.sorted.bam")
    if verify_input_file("BAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_insert_size = metrics.get("aveInsertSize", "") # reuse already fresh metrics
        if ave_insert_size:
            verbose_print("Reusing previously calculated mean insert size")
        else:
            # Extract inferred insert sizes (TLEN, column 9 of BAM file) for reads "mapped in proper pair" (2) and "first in pair" (64) = 66
            tempfile = NamedTemporaryFile(delete=False, dir=sample_dir, prefix="tmp.inserts.", mode='w')
            command.run("samtools view -f 66 " + file + " | cut -f 9 | sed 's/^-//'", tempfile.name)
            insert_count = 0
            insert_sum = 0
            with open(tempfile.name) as f:
                for line in f:
                    try:
                        insert_sum += int(line)
                        insert_count += 1
                    except ValueError:
                        pass
            os.unlink(tempfile.name)
            if insert_count > 0 and insert_sum > 0:
                ave_insert_size = float(insert_sum) / float(insert_count)
                ave_insert_size = "%.2f" % ave_insert_size
            else:
                handle_error("Cannot calculate mean insert size.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    file = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth", "") # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(file) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                #print("depth_sum=%i" % depth_sum);
                #print("reference_length=%i" % reference_length)
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    file = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps", "") # reuse already fresh metrics
        if phase1_snps:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            phase1_snps = count_vcf_file_snps(file)

        # Flag excessive snps
        if max_allowed_snps > 0 and phase1_snps > max_allowed_snps:
            excluded_sample = "Excluded"
            handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    file = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "") # reuse already fresh metrics
        if phase1_snps_preserved:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            phase1_snps_preserved = count_vcf_file_snps(file)

        # Flag excessive snps
        if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps:
            excluded_sample_preserved = "Excluded"
            handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps_preserved = str(phase1_snps_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    file = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps", "") # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(file)
                phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get("snpsPreserved", "") # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print("Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(file)
                phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    file = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos", "") # reuse already fresh metrics
            if missing_pos:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get("missingPosPreserved", "") # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
Example #4
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is
    generated, and SNPs are called.

    This function expects, or creates (files marked with '*'), the following
    files arranged as follows:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get("SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher
        samtools_version = version_str.split()[-1] # just the number
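        # Caution: lexicographic string comparison, so "1.10" compares as less than "1.3"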
        if samtools_version < "1.3":
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file], deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error("Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable.")
            else:
                version_str = utils.extract_version_str("Picard", "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1")
                picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file, "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
Example #5
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates (files marked with '*'), the following
    files arranged as follows:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file.  Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory",
                                     sample_dir,
                                     error_handler="sample",
                                     continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print(
        "# %s %s" %
        (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files
                   if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum([os.path.getsize(file) for file in fastq_files])

    # Make a comma separated list of just the fastq file names without directories
    fastq_file_list = [os.path.basename(file) for file in fastq_files]
    fastq_file_list = ", ".join(fastq_file_list)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(
    ), "Calculate number of reads, %mapped, %proper pair, and ave insert size from sam file"
                               ))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    percent_proper_pair = ""
    ave_insert_size = ""
    file = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads",
                                    "")  # reuse already fresh metrics
            percent_reads_mapped = metrics.get(
                "percentReadsMapped", "")  # reuse already fresh metrics
            percent_proper_pair = metrics.get(
                "percentProperPair", "")  # reuse already fresh metrics
            ave_insert_size = metrics.get("aveInsertSize",
                                          "")  # reuse already fresh metrics
        missing_any_metrics = not all([
            num_reads, percent_reads_mapped, percent_proper_pair,
            ave_insert_size
        ])
        if not missing_any_metrics:
            verbose_print(
                "Reusing previously calculated number of reads, %mapped, %proper pair, and ave insert size"
            )
        else:
            tempfile_path = os.path.join(sample_dir, "tmp.sam.stats")
            try:
                command.run("samtools stats " + file, tempfile_path)
            except subprocess.CalledProcessError:
                pass  # the error message has already been printed to stderr
            with open(tempfile_path) as f:
                for line in f:
                    lower_line = line.lower()
                    split_line = line.strip().split('\t')
                    if "raw total sequences:" in lower_line:
                        num_reads = split_line[2]
                        continue
                    if "reads mapped:" in lower_line:
                        reads_mapped = split_line[2]
                        try:
                            percent_reads_mapped = 100.0 * float(
                                reads_mapped) / float(num_reads)
                            percent_reads_mapped = "%.2f" % percent_reads_mapped
                        except ValueError:
                            percent_reads_mapped = ""
                        continue
                    if "reads properly paired:" in lower_line:
                        proper_pairs = split_line[2]
                        try:
                            percent_proper_pair = 100.0 * float(
                                proper_pairs) / float(num_reads)
                            percent_proper_pair = "%.2f" % percent_proper_pair
                        except ValueError:
                            percent_proper_pair = ""
                        continue
                    if "insert size average:" in lower_line:
                        ave_insert_size = split_line[2]
                        continue
            os.unlink(tempfile_path)
            missing_any_metrics = not all([
                num_reads, percent_reads_mapped, percent_proper_pair,
                ave_insert_size
            ])
            if missing_any_metrics:
                missing_list = []
                if not num_reads:
                    missing_list.append("number of reads")
                if not percent_reads_mapped:
                    missing_list.append("percent reads mapped")
                if not percent_proper_pair:
                    missing_list.append("percent proper pair")
                if not ave_insert_size:
                    missing_list.append("ave insert size")
                error_text = "Cannot calculate " + ", ".join(
                    missing_list) + '.'
                handle_error(error_text)

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print(
            "# %s %s" %
            (utils.timestamp(),
             "Calculate number of duplicate reads from deduped bam file"))
        file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", file):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads",
                                            "")  # reuse already fresh metrics
            if num_dup_reads:
                verbose_print(
                    "Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " +
                                            file)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    file = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth",
                                           "")  # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(file) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                #print("depth_sum=%i" % depth_sum);
                #print("reference_length=%i" % reference_length)
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(
    ), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    file = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps",
                                      "")  # reuse already fresh metrics
        if phase1_snps:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            phase1_snps = count_vcf_file_snps(file)

        # Flag excessive snps. When phase1_snps is reused from an earlier metrics
        # file it is a string, so convert before comparing against the threshold.
        if max_allowed_snps > 0 and int(phase1_snps) > max_allowed_snps:
            excluded_sample = "Excluded"
            handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(
    ), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"
                               ))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    file = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get(
                "phase1SnpsPreserved", "")  # reuse already fresh metrics
        if phase1_snps_preserved:
            verbose_print(
                "Reusing previously calculated preserved phase1 snps")
        else:
            phase1_snps_preserved = count_vcf_file_snps(file)

        # Flag excessive snps. When phase1_snps_preserved is reused from an earlier
        # metrics file it is a string, so convert before comparing against the threshold.
        if max_allowed_snps > 0 and int(phase1_snps_preserved) > max_allowed_snps:
            excluded_sample_preserved = "Excluded"
            handle_error("Excluded: preserved exceeded %i maxsnps." %
                         max_allowed_snps)
        phase1_snps_preserved = str(phase1_snps_preserved)

    #-------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(),
                   "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    file = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps",
                                          "")  # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(file)
                phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print(
        "# %s %s" %
        (utils.timestamp(),
         "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get(
                    "snpsPreserved", "")  # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print(
                    "Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(file)
                phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print(
        "# %s %s" %
        (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    file = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos",
                                          "")  # reuse already fresh metrics
            if missing_pos:
                verbose_print(
                    "Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(
                    file, sample_id)
                missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(),
                   "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get(
                    "missingPosPreserved", "")  # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print(
                    "Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(
                    file, sample_id)
                missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("percentProperPair=" + percent_proper_pair, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
def call_sites(args):
    """Find the sites with SNPs in a sample.

    A pileup is generated from the prepared sample alignment and snps are called.
    Sorting, duplicate removal, and local indel realignment are performed by
    earlier pipeline steps.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    input_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    input_bam_file = utils.add_file_suffix(input_bam_file, ".deduped", enable=remove_duplicate_reads)
    input_bam_file = utils.add_file_suffix(input_bam_file, ".indelrealigned", enable=enable_local_realignment)
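    # With both options left at their default of "true", the expected input is
    # sample_dir/reads.sorted.deduped.indelrealigned.bam, matching the docstring
    # layout above; disabling either option drops the corresponding suffix.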

    utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([input_bam_file, reference_file_path], pileup_file)
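    # target_needs_rebuild presumably reports the pileup as stale when it is missing
    # or older than the input BAM or the reference; a fresh pileup is reused unless
    # the -f flag forces a rebuild.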
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + input_bam_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
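        # The VarScan jar is located by scanning the entries of the CLASSPATH
        # environment variable for a path containing "VarScan".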
        if not jar_file_path:
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java -jar " + jar_file_path + " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " -jar " + jar_file_path + " mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")