Пример #1
0
def extract_version_str(program_name, command_line):
    """Run a program with options to emit the version and construct
    a string with the program name and version.

    Parameters
    ----------
    program_name : str
        Friendly program name -- this will be returned in the version string
    command_line : str
        Command to be executed to get the version somewhere in the output

    Returns
    -------
    version_str : str
        A version string of the form "program_name version 2.3.0" or
        "Unrecognized program_name version".  The version identifier is
        returned with the same letter case the program printed.
    """
    # Run the command to get the version, then keep only the trimmed,
    # non-blank output lines
    text = command.run(command_line)
    lines = [line.strip() for line in text.split('\n')]
    lines = [line for line in lines if line]

    # Look for an output line with the word "version" and return the token
    # that follows it.  Only the comparison is case-insensitive: the tokens
    # are taken from the original line so an identifier such as "1.0-RC1"
    # is not lowercased, matching the single-token fallback below.
    for line in lines:
        if "version" not in line.lower():
            continue
        # Treat "Version: 1.2" the same as "Version 1.2"
        tokens = line.replace(':', ' ').split()
        # The last token is skipped because nothing can follow it
        for index, token in enumerate(tokens[:-1]):
            if token.lower() == "version":
                return program_name + " version " + tokens[index + 1]

    # if only one line and only one token, assume it is the version identifier
    if len(lines) == 1:
        tokens = lines[0].split()
        if len(tokens) == 1:
            return program_name + " version " + tokens[0]

    return "Unrecognized " + program_name + " version"
Пример #2
0
def extract_version_str(program_name, command_line):
    """Run a program with options to emit the version and construct
    a string with the program name and version.

    Parameters
    ----------
    program_name : str
        Friendly program name -- this will be returned in the version string
    command_line : str
        Command to be executed to get the version somewhere in the output

    Returns
    -------
    version_str : str
        A version string of the form "program_name version 2.3.0" or
        "Unrecognized program_name version".  The version identifier is
        returned with the same letter case the program printed.
    """
    # Run the command to get the version, then keep only the trimmed,
    # non-blank output lines
    text = command.run(command_line)
    lines = [line.strip() for line in text.split('\n')]
    lines = [line for line in lines if line]

    # Look for an output line with the word "version" and return the token
    # that follows it.  Only the comparison is case-insensitive: the tokens
    # are taken from the original line so an identifier such as "1.0-RC1"
    # is not lowercased, matching the single-token fallback below.
    for line in lines:
        if "version" not in line.lower():
            continue
        # Treat "Version: 1.2" the same as "Version 1.2"
        tokens = line.replace(':', ' ').split()
        # The last token is skipped because nothing can follow it
        for index, token in enumerate(tokens[:-1]):
            if token.lower() == "version":
                return program_name + " version " + tokens[index + 1]

    # if only one line and only one token, assume it is the version identifier
    if len(lines) == 1:
        tokens = lines[0].split()
        if len(tokens) == 1:
            return program_name + " version " + tokens[0]

    return "Unrecognized " + program_name + " version"
Пример #3
0
def call_sites(args):
    """Call the SNP sites of a single sample.

    A pileup is generated from the sample BAM file with ``samtools mpileup``
    and VarScan ``mpileup2snp`` is then run on that pileup to write a VCF
    file.  Each of the two steps is skipped when its output is already
    fresh, unless a rebuild is forced.

    Files expected, or created '(*)', by this function:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The ".deduped" and ".indelrealigned" parts of the BAM file name are
    requested only when the RemoveDuplicateReads and EnableLocalRealignment
    environment variables are "true" (the default for both).

    The input files are produced outside of this function.  The package
    documentation shows how to prepare them using the lambda_virus sequence
    that serves as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        forceFlag : When true, rebuild the outputs even if they are fresh
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    # ------------------------------------------------------------------
    # Input validation
    # ------------------------------------------------------------------

    # The reference fasta must exist and be non-empty
    ref_fasta = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [ref_fasta],
                                       error_handler="global")

    sample_dir = args.sampleDir

    dedup_enabled = os.environ.get("RemoveDuplicateReads",
                                   "true").lower() == "true"
    realign_enabled = os.environ.get("EnableLocalRealignment",
                                     "true").lower() == "true"

    # Derive the BAM file name from the enabled upstream processing steps
    bam_path = os.path.join(sample_dir, "reads.sorted.bam")
    for suffix, enabled in ((".deduped", dedup_enabled),
                            (".indelrealigned", realign_enabled)):
        bam_path = utils.add_file_suffix(bam_path, suffix, enable=enabled)

    utils.verify_non_empty_input_files("Sample BAM file", [bam_path],
                                       error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    # ------------------------------------------------------------------
    # Pileup generation
    # ------------------------------------------------------------------

    pileup_path = os.path.join(sample_dir, "reads.all.pileup")
    pileup_stale = utils.target_needs_rebuild([bam_path, ref_fasta],
                                              pileup_path)
    if args.forceFlag or pileup_stale:
        samtools_version = utils.extract_version_str(
            "SAMtools", "samtools 2>&1 > /dev/null")
        mpileup_extra = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        mpileup_cmd = "samtools mpileup " + mpileup_extra + " -f " + ref_fasta + ' ' + bam_path
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), mpileup_cmd))
        verbose_print("# %s" % samtools_version)
        command.run(mpileup_cmd, pileup_path)
        utils.sample_error_on_missing_file(pileup_path, "samtools mpileup")
        verbose_print("")
    else:
        verbose_print(
            "# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)

    # ------------------------------------------------------------------
    # SNP calling
    # ------------------------------------------------------------------

    vcf_path = os.path.join(sample_dir, "var.flt.vcf")
    vcf_stale = utils.target_needs_rebuild([pileup_path], vcf_path)
    if not (args.forceFlag or vcf_stale):
        verbose_print(
            "# VCF file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
        return

    # VarScan is located through the CLASSPATH; nothing more can be done without it
    varscan_jar = utils.find_path_in_path_list("VarScan", "CLASSPATH")
    if not varscan_jar:
        utils.global_error(
            "Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable."
        )
        return

    varscan_version = utils.extract_version_str(
        "VarScan", "java -jar " + varscan_jar +
        " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
    jvm_extra = os.environ.get("VarscanJvm_ExtraParams") or ""
    mpileup2snp_extra = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
    varscan_cmd = "java " + jvm_extra + " -jar " + varscan_jar + " mpileup2snp " + pileup_path + " --output-vcf 1 " + mpileup2snp_extra
    verbose_print("# Create vcf file")
    verbose_print("# %s %s" % (utils.timestamp(), varscan_cmd))
    verbose_print("# %s" % varscan_version)
    command.run(varscan_cmd, vcf_path)

    # The vcf file is scanned for these strings after the run and a sample
    # error is reported if they are present
    utils.sample_error_on_missing_file(vcf_path, "VarScan")
    for failure_text in ("OutOfMemoryError", "Insufficient"):
        utils.sample_error_on_file_contains(vcf_path, failure_text, "VarScan")
Пример #4
0
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*

    The reverse fastq file is optional.
    The fastq files may be either compressed with gzip or uncompressed.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
        forceFlag : When true, align even if reads.sam is already fresh
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file", fastq_files, error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        # NOTE(review): the code below assumes global_error does not return -- confirm
        utils.global_error("Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[0] # strip the file extension
    reference_id = os.path.basename(reference_base_path)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")

    # The sam file depends on the fastq files and on the aligner's reference index
    source_files = list(fastq_files)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print("# %s has already been aligned to %s.  Use the -f option to force a rebuild." % (sample_id, reference_id))
        return

    #==========================================================================
    # Construct the command line to execute bowtie2 or smalt
    #==========================================================================

    # The read group identifies reads from a single run and lane
    read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1, sample_id)

    # Default to 8 cores on HPC or all cpu cores on workstation
    if os.environ.get("JOB_ID") or os.environ.get("PBS_JOBID"):
        num_cores = 8
    else:
        # psutil.cpu_count() returns None when the count cannot be determined;
        # fall back to one core rather than emitting "-p None" / "-n None"
        num_cores = psutil.cpu_count() or 1

    num_cores_param = ""

    if snp_pipeline_aligner == "bowtie2":
        version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")

        # Parse the user-specified bowtie parameters to determine if the user specified the number of CPU cores
        bowtie2_align_extra_params = os.environ.get("Bowtie2Align_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(bowtie2_align_extra_params, "-p"):
            num_cores_param = "-p " + str(num_cores)

        # Specify the read group and sample tags here, --rg tags cannot be specified without ID.
        # The read group tags are used by some downstream tools, like Picard and GATK.
        read_group_params = ""
        if read_group_tags:
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

        # Substitute the default parameters if the user did not specify bowtie parameters
        bowtie2_align_params = bowtie2_align_extra_params or "--reorder -q"

        # Build the command with options depending on whether the fastq files are paired
        command_line = "bowtie2 " + num_cores_param + " " + read_group_params + " " + bowtie2_align_params + " -x " + reference_base_path
        if sample_fastq_file2:
            command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
        else:
            command_line += " -U " + sample_fastq_file1

    elif snp_pipeline_aligner == "smalt":
        version_str = utils.extract_version_str("smalt", "smalt version")

        # Parse the user-specified smalt parameters to determine if the user specified the number of CPU cores
        smalt_align_extra_params = os.environ.get("SmaltAlign_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(smalt_align_extra_params, "-n"):
            num_cores_param = "-n " + str(num_cores)

        # Substitute the default parameters if the user did not specify smalt parameters
        smalt_align_params = smalt_align_extra_params or "-O"

        # Don't use the -i 1000 option if the fastq file is unpaired.
        # The substitution is applied to the effective parameters so the
        # default "-O" is kept when the user supplied no parameters.
        if not sample_fastq_file2:
            smalt_align_params = re.sub(r"-i[ ]+[0-9]+", '', smalt_align_params) # regex substitute

        command_line = "smalt map " + num_cores_param + " " + smalt_align_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (sample_fastq_file2 or "")

    #==========================================================================
    # Run the command to execute bowtie2 or smalt
    #==========================================================================
    verbose_print("# Align sequence %s to reference %s" % (sample_id, reference_id))
    verbose_print("# %s %s" % (utils.timestamp(), command_line))
    verbose_print("# %s" % version_str)
    command.run(command_line, sam_file)

    #==========================================================================
    # When using smalt, assign read groups in a separate step.
    # This is already done when using bowtie2.
    #==========================================================================
    if snp_pipeline_aligner == "smalt" and read_group_tags:
        # Keep the raw smalt output under another name; Picard writes the
        # read-group-tagged result back to reads.sam
        smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
        shutil.move(sam_file, smalt_sam_file)
        version_str = utils.extract_version_str("Picard", "java  picard.cmdline.PicardCommandLine AddOrReplaceReadGroups --version 2>&1")
        jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
        command_line = "java " + jvm_params + " picard.cmdline.PicardCommandLine AddOrReplaceReadGroups"
        command_line += " I=" + smalt_sam_file
        command_line += " O=" + sam_file
        command_line += " RGID=" + read_group_tags.ID
        command_line += " RGSM=" + read_group_tags.SM
        command_line += " RGLB=" + read_group_tags.LB
        command_line += " RGPL=" + read_group_tags.PL
        command_line += " RGPU=" + read_group_tags.PU
        verbose_print("")
        verbose_print("# Assign read group id %s" % (read_group_tags.ID))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
Пример #5
0
def index_ref(args):
    """Index the reference genome.

    Execute an external program (bowtie2 or smalt) to create an index for the
    reference genome to be used during subsequent alignment.  Execute samtools
    to create the faidx index file to be used during subsequent pileups.
    When the EnableLocalRealignment environment variable is "true" (the
    default), also execute Picard to create the sequence dictionary.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta         # input fasta
                referenceFile.#.bt2*        # bowtie2 output
                referenceFile.rev.#.bt2*    # bowtie2 output
                referenceFile.sma*          # smalt output
                referenceFile.smi*          # smalt output
                referenceFile.fasta.fai*    # samtools faidx output
                referenceFile.dict*         # picard CreateSequenceDictionary output

    The input fasta file is created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        forceFlag : When true, rebuild the index files even if they are fresh
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    reference_base_path = os.path.splitext(reference_file_path)[
        0]  # strip the file extension

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error(
            "Error: only bowtie2 and smalt aligners are supported.")

    # Create index file for reference
    if snp_pipeline_aligner == "bowtie2":
        target_file = reference_base_path + ".rev.1.bt2"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Bowtie index %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            version_str = utils.extract_version_str("bowtie2",
                                                    "bowtie2 --version")
            bowtie2_build_extra_params = os.environ.get(
                "Bowtie2Build_ExtraParams") or ""
            command_line = "bowtie2-build " + bowtie2_build_extra_params + ' ' + reference_file_path + ' ' + reference_base_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
            utils.global_error_on_missing_file(target_file, "bowtie2-build")

    elif snp_pipeline_aligner == "smalt":
        target_file = reference_base_path + ".smi"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Smalt index %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            version_str = utils.extract_version_str("smalt", "smalt version")
            smalt_index_extra_params = os.environ.get(
                "SmaltIndex_ExtraParams") or ""
            command_line = "smalt index " + smalt_index_extra_params + ' ' + reference_base_path + ' ' + reference_file_path
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
            # Report a missing index the same way the bowtie2 branch does
            utils.global_error_on_missing_file(target_file, "smalt index")

    # Create the samtools fai index
    verbose_print("")
    target_file = reference_file_path + ".fai"
    needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                               target_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# SAMtools fai index %s is already freshly built.  Use the -f option to force a rebuild."
            % target_file)
    else:
        version_str = utils.extract_version_str("samtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_faidx_extra_params = os.environ.get(
            "SamtoolsFaidx_ExtraParams") or ""
        command_line = "samtools faidx " + samtools_faidx_extra_params + ' ' + reference_file_path
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.global_error_on_missing_file(target_file, "samtools faidx")

    # Create the reference dict file used later by GATK
    enable_local_realignment = os.environ.get("EnableLocalRealignment",
                                              "true").lower() == "true"
    if enable_local_realignment:
        verbose_print("")
        target_file = reference_base_path + ".dict"
        needs_rebuild = utils.target_needs_rebuild([reference_file_path],
                                                   target_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Sequence dictionary %s is already freshly built.  Use the -f option to force a rebuild."
                % target_file)
        else:
            utils.remove_file(
                target_file
            )  # Need to delete existing output, if any, before running
            jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
            if not jar_file_path:
                utils.global_error(
                    "Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable."
                )
            else:
                # Only build and run the Picard commands when the jar was found
                version_str = utils.extract_version_str(
                    "Picard", "java -jar " + jar_file_path +
                    " CreateSequenceDictionary --version 2>&1")
                picard_jvm_extra_params = os.environ.get(
                    "PicardJvm_ExtraParams") or ""
                picard_create_sequence_dictionary_extra_params = os.environ.get(
                    "CreateSequenceDictionary_ExtraParams") or ""
                # Pass a temp directory to Picard only when one is configured
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + " -jar " + jar_file_path + " CreateSequenceDictionary REFERENCE=" + reference_file_path + " OUTPUT=" + target_file + tmp_option + ' ' + picard_create_sequence_dictionary_extra_params
                verbose_print("# Create reference sequence dictionary.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                # The dictionary is a reference-level output, so a missing file
                # is reported as a global error like the other index files
                utils.global_error_on_missing_file(
                    target_file, "picard CreateSequenceDictionary")
Пример #6
0
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file.  Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)] # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum([os.path.getsize(file) for file in fastq_files])

    # Make a comma separated list of just the fastq file names without directories
    fastq_file_list = [os.path.basename(file) for file in fastq_files]
    fastq_file_list = ", ".join(fastq_file_list)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads and %mapped from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    file = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads", "") # reuse already fresh metrics
            percent_reads_mapped = metrics.get("percentReadsMapped", "") # reuse already fresh metrics
        if num_reads and percent_reads_mapped:
            verbose_print("Reusing previously calculated number of reads and %mapped")
        else:
            num_reads = command.run("samtools view -S -c " + file)
            num_reads = num_reads.strip()
            mapped = command.run("samtools view -S -c -F 4 " + file)
            mapped = mapped.strip()
            try:
                percent_reads_mapped = 100.0 * float(mapped) / float(num_reads)
                percent_reads_mapped = "%.2f" % percent_reads_mapped
            except ValueError:
                handle_error("Cannot calculate number of reads and %mapped.")

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", file):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads", "") # reuse already fresh metrics
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + file)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean insert size from bam file"))
    #-------------------------
    ave_insert_size = ""
    file = os.path.join(sample_dir, "reads.sorted.bam")
    if verify_input_file("BAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_insert_size = metrics.get("aveInsertSize", "") # reuse already fresh metrics
        if ave_insert_size:
            verbose_print("Reusing previously calculated mean insert size")
        else:
            # Extract inferred insert sizes (TLEN, column 9 of BAM file) for reads "mapped in proper pair" (2) and "first in pair" (64) = 66
            tempfile = NamedTemporaryFile(delete=False, dir=sample_dir, prefix="tmp.inserts.", mode='w')
            command.run("samtools view -f 66 " + file + " | cut -f 9 | sed 's/^-//'", tempfile.name)
            insert_count = 0
            insert_sum = 0
            with open(tempfile.name) as f:
                for line in f:
                    try:
                        insert_sum += int(line)
                        insert_count += 1
                    except ValueError:
                        pass
            os.unlink(tempfile.name)
            if insert_count > 0 and insert_sum > 0:
                ave_insert_size = float(insert_sum) / float(insert_count)
                ave_insert_size = "%.2f" % ave_insert_size
            else:
                handle_error("Cannot calculate mean insert size.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    file = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth", "") # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(file) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                #print("depth_sum=%i" % depth_sum);
                #print("reference_length=%i" % reference_length)
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    file = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps", "") # reuse already fresh metrics
        if phase1_snps:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            phase1_snps = count_vcf_file_snps(file)

        # Flag excessive snps
        if max_allowed_snps > 0 and phase1_snps > max_allowed_snps:
            excluded_sample = "Excluded"
            handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    file = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "") # reuse already fresh metrics
        if phase1_snps_preserved:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            phase1_snps_preserved = count_vcf_file_snps(file)

        # Flag excessive snps
        if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps:
            excluded_sample_preserved = "Excluded"
            handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps_preserved = str(phase1_snps_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    file = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps", "") # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(file)
                phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get("snpsPreserved", "") # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print("Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(file)
                phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    file = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos", "") # reuse already fresh metrics
            if missing_pos:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get("missingPosPreserved", "") # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
Пример #7
0
def run(args):
    """Run all the steps of the snp pipeline in th correct order.

    Parameters
    ----------
    args : Namespace
        referenceFile : str
            Relative or absolute path to the reference fasta file
        forceFlag : bool
            Force processing even when result files already exist and are newer than inputs
        mirror : str
            Mode to create a mirror copy of the reference directory and all the sample directories.
            Possible values: {soft, hard, copy}
        configFile : str
            Relative or absolute path to a configuration file for overriding defaults and defining
            extra parameters for the tools and scripts within the pipeline.
        jobQueueMgr : str
            Job queue manager for remote parallel job execution in an HPC environment.  Currently
            "torque" and "grid" are supported.  If not specified, the pipeline will execute locally.
        workDir : str
            Output directory for the result files.
        samplesDir : str
            Relative or absolute path to the parent directory of all the sample directories.
        samplesFile : str
            Relative or absolute path to a file listing all of the sample directories.
    """
    global log_dir
    global job_queue_mgr

    # Where are we running: grid, torque, or None (local)
    job_queue_mgr = args.jobQueueMgr

    # Erase any left-over error log environment variable from a previous run
    os.environ.pop("errorOutputFile", None) # the 2nd arg avoids an exception when not in dict

    # Handle output working directory.  Create the directory if it does not exist.
    # Any errors creating the work_dir will not be logged to the error log because
    # the error log belongs in the work_dir.
    work_dir = args.workDir
    try:
        utils.mkdir_p(work_dir)
    except OSError as exc:
        utils.fatal_error("Error: could not create the output directory %s" % work_dir)
    if not utils.is_directory_writeable(work_dir):
        utils.fatal_error("Error: output directory % is not writable." % work_dir)

    # The error log is in the main workdir
    error_output_file = os.path.join(work_dir, "error.log")
    os.environ["errorOutputFile"] = error_output_file
    # TODO: copy old error log to old logs directory, because otherwise it will be removed and lost forever
    if os.path.isfile(error_output_file):
        os.remove(error_output_file)

    # Validate reference fasta file
    reference_file_path = args.referenceFile
    if not os.path.isfile(reference_file_path):
        utils.fatal_error("Error: reference file %s does not exist." % reference_file_path)
    if os.path.getsize(reference_file_path) == 0:
        utils.fatal_error("Error: reference file %s is empty." % reference_file_path)
    reference_file_name = os.path.basename(reference_file_path)

    # Force rebuild flag is passed to all the subtask commands below
    force_flag = " -f " if args.forceFlag else " "

    # Create the logs directory with name like "logs-20170215.144253"
    run_time_stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
    log_dir = os.path.join(work_dir, "logs-" + run_time_stamp)
    try:
        utils.mkdir_p(log_dir)
    except OSError as exc:
        utils.fatal_error("Error: could not create the logs directory %s" % log_dir)
    if not utils.is_directory_writeable(work_dir):
        utils.fatal_error("Error: logs directory % is not writable." % log_dir)

    # Handle configuration file, use the specified file, or create a default file
    if args.configFile:
        config_file_path = args.configFile
        if not os.path.isfile(config_file_path):
            utils.fatal_error("Error: configuration file %s does not exist." % config_file_path)
        if os.path.getsize(config_file_path) == 0:
            utils.fatal_error("Error: configuration file %s is empty." % config_file_path)

        shutil.copy2(config_file_path, log_dir)  # copy2 tries to preserve timestamps
        config_params = utils.read_properties(config_file_path, recognize_vars=True)
        validate_properties(config_params)
    else:
        command.run("cfsan_snp_pipeline data configurationFile " + log_dir, outfile=sys.stdout)
        config_file_path = os.path.join(log_dir, "snppipeline.conf")
        config_params = utils.read_properties(config_file_path, recognize_vars=True)

    # Validate the configured aligner choice
    snp_pipeline_aligner = config_params.get("SnpPipeline_Aligner", "").lower() or "bowtie2"
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.fatal_error("Config file error in SnpPipeline_Aligner parameter: only bowtie2 and smalt aligners are supported.")
    os.environ["SnpPipeline_Aligner"] = snp_pipeline_aligner

    # Stop the pipeline by default upon single sample errors if not configured either way
    # The environment variable is used by called processes
    stop_on_error = config_params.get("StopOnSampleError", "").lower() or "true"
    os.environ["StopOnSampleError"] = stop_on_error

    # Convert the stop_on_error flag to boolean for internal use in this function
    stop_on_error = stop_on_error == "true"

    # How many CPU cores can we use?
    max_cpu_cores = config_params.get("MaxCpuCores", None)
    if max_cpu_cores == "":
        max_cpu_cores = None
    if max_cpu_cores:
        try:
            max_cpu_cores = int(max_cpu_cores)
            if max_cpu_cores < 1:
                utils.fatal_error("Config file error in MaxCpuCores parameter: %s is less than one." % max_cpu_cores)
        except ValueError:
            utils.fatal_error("Config file error in MaxCpuCores parameter: %s is not a valid number." % max_cpu_cores)

    if job_queue_mgr is None: # workstation
        num_local_cpu_cores = psutil.cpu_count()
        max_cpu_cores = min(num_local_cpu_cores, max_cpu_cores) if max_cpu_cores else num_local_cpu_cores

    # Put the configuration parameters into the process environment variables
    os.environ["Bowtie2Build_ExtraParams"] = config_params.get("Bowtie2Build_ExtraParams", "")
    os.environ["SmaltIndex_ExtraParams"] = config_params.get("SmaltIndex_ExtraParams", "")
    os.environ["SamtoolsFaidx_ExtraParams"] = config_params.get("SamtoolsFaidx_ExtraParams", "")
    os.environ["Bowtie2Align_ExtraParams"] = config_params.get("Bowtie2Align_ExtraParams", "")
    os.environ["SmaltAlign_ExtraParams"] = config_params.get("SmaltAlign_ExtraParams", "")
    os.environ["SamtoolsSamFilter_ExtraParams"] = config_params.get("SamtoolsSamFilter_ExtraParams", "")
    os.environ["SamtoolsSort_ExtraParams"] = config_params.get("SamtoolsSort_ExtraParams", "")
    os.environ["RemoveDuplicateReads"] = config_params.get("RemoveDuplicateReads", "").lower() or "true"
    os.environ["PicardMarkDuplicates_ExtraParams"] = config_params.get("PicardMarkDuplicates_ExtraParams", "")
    os.environ["PicardJvm_ExtraParams"] = config_params.get("PicardJvm_ExtraParams", "")
    os.environ["SamtoolsMpileup_ExtraParams"] = config_params.get("SamtoolsMpileup_ExtraParams", "")
    os.environ["VarscanMpileup2snp_ExtraParams"] = config_params.get("VarscanMpileup2snp_ExtraParams", "")
    os.environ["VarscanJvm_ExtraParams"] = config_params.get("VarscanJvm_ExtraParams", "")
    os.environ["FilterRegions_ExtraParams"] = config_params.get("FilterRegions_ExtraParams", "")
    os.environ["MergeSites_ExtraParams"] = config_params.get("MergeSites_ExtraParams", "")
    os.environ["CallConsensus_ExtraParams"] = config_params.get("CallConsensus_ExtraParams", "")
    os.environ["SnpMatrix_ExtraParams"] = config_params.get("SnpMatrix_ExtraParams", "")
    os.environ["BcftoolsMerge_ExtraParams"] = config_params.get("BcftoolsMerge_ExtraParams", "")
    os.environ["SnpReference_ExtraParams"] = config_params.get("SnpReference_ExtraParams", "")
    os.environ["MergeVcfs_ExtraParams"] = config_params.get("MergeVcfs_ExtraParams", "")
    os.environ["CollectMetrics_ExtraParams"] = config_params.get("CollectMetrics_ExtraParams", "")
    os.environ["CombineMetrics_ExtraParams"] = config_params.get("CombineMetrics_ExtraParams", "")

    # Verify the dependencies are available on the path
    dependencies = ["cfsan_snp_pipeline", snp_pipeline_aligner, "samtools", "java", "tabix", "bgzip", "bcftools"]
    found_all_dependencies = True
    for executable in dependencies:
        if not utils.which(executable):
            utils.report_error(executable + " is not on the path")
            found_all_dependencies = False

    stdout = command.run("java net.sf.varscan.VarScan 2>&1")
    if "Error" in stdout:
        utils.report_error("CLASSPATH is not configured with the path to VarScan")
        found_all_dependencies = False

    if os.environ["RemoveDuplicateReads"] == "true":
        stdout = command.run("java picard.cmdline.PicardCommandLine 2>&1")
        if "Error" in stdout:
            utils.report_error("CLASSPATH is not configured with the path to Picard")
            found_all_dependencies = False

    if not found_all_dependencies:
        utils.fatal_error("Check the SNP Pipeline installation instructions here: http://snp-pipeline.readthedocs.org/en/latest/installation.html")

    # Process the sample directory command line option
    # TODO: detect broken fastq symlinks
    if args.samplesDir:
        samples_parent_dir = args.samplesDir.rstrip('/') # strip trailing slash
        if not utils.verify_non_empty_directory("Samples directory", samples_parent_dir):
            sys.exit(1)

        # verify at least one of the subdirectories contains fastq files.
        dir_sizes = get_sorted_sample_dirs_fastq_sizes(samples_parent_dir)
        dir_sizes = [(size, path) for size, path in dir_sizes if size > 0]
        if len(dir_sizes) == 0:
            utils.fatal_error("Samples directory %s does not contain subdirectories with fastq files." % samples_parent_dir)

        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        persist_sorted_sample_dirs_file(samples_parent_dir, sample_dirs_file)

    # Process the file of sample directories command line option
    # TODO: detect broken fastq symlinks
    if args.samplesFile:
        sample_dirs_file = args.samplesFile
        if not os.path.isfile(sample_dirs_file):
            utils.fatal_error("Error: the file of samples directories, %s, does not exist." % sample_dirs_file)
        if os.path.getsize(sample_dirs_file) == 0:
            utils.fatal_error("Error: the file of samples directories, %s, is empty." % sample_dirs_file)
        rewrite_cleansed_file_of_sample_dirs(sample_dirs_file, os.path.join(work_dir, "sampleDirectories.txt"))
        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        validate_file_of_sample_dirs(sample_dirs_file)

    with open(sample_dirs_file) as f:
        sample_dirs_list = f.read().splitlines()
    sample_count = len(sample_dirs_list)

    # --------------------------------------------------------
    if job_queue_mgr is None:
        progress("Step 1 - Prep work")
    else:
        print("Step 1 - Prep work")

    # --------------------------------------------------------
    # Mirror the input reference and samples if requested
    # TODO: make this a pure python solution
    if args.mirror:
        if args.mirror == "soft":
            # soft link, subsequent freshness checks use the timestamp of original file, not the soft link
            mirror_flag = " -s "
        elif args.mirror == "hard":
            # hard link, automatically preserves attributes of the original file
            mirror_flag = " -l "
        else:
            # regular copy, -p explicitly preserves attributes of the original file
            mirror_flag = " -p "

        # flush stdout to keep the unbuffered stderr in chronological order with stdout
        sys.stdout.flush()

        # Mirror/link the reference
        work_reference_dir = os.path.join(work_dir, "reference")
        utils.mkdir_p(work_reference_dir)
        src_reference_file = os.path.abspath(reference_file_path)
        cmd = "cp -v -u -f" + mirror_flag + src_reference_file + ' ' + work_reference_dir
        subprocess.check_call(cmd, shell=True)

        # since we mirrored the reference, we need to update our reference location
        reference_file_path = os.path.join(work_reference_dir, reference_file_name)

        # Mirror/link the samples
        work_samples_parent_dir = os.path.join(work_dir, "samples")
        for directory in sample_dirs_list:
            basedir = os.path.basename(directory)
            work_sample_dir = os.path.join(work_samples_parent_dir, basedir)
            utils.mkdir_p(work_sample_dir)
            src_sample_dir = os.path.abspath(directory)
            # copy without stderr message and without exit error code because the fastq or fq files might not exist
            cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fastq* " + work_sample_dir + " 2> /dev/null || true"
            subprocess.check_call(cmd, shell=True)
            cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fq* " + work_sample_dir + " 2> /dev/null || true"
            subprocess.check_call(cmd, shell=True)

        # since we mirrored the samples, we need to update our sorted list of samples
        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        persist_sorted_sample_dirs_file(work_samples_parent_dir, sample_dirs_file)

        # refresh the list of sample dirs -- now in sorted order
        with open(sample_dirs_file) as f:
            sample_dirs_list = f.read().splitlines()

    # get the *.fastq or *.fq files in each sample directory, possibly compressed, on one line per sample, ready to feed to bowtie
    sample_full_path_names_file = os.path.join(work_dir, "sampleFullPathNames.txt")
    with open(sample_full_path_names_file, 'w') as f:
        for directory in sample_dirs_list:
            file_list = fastq.list_fastq_files(directory)
            print(' '.join(file_list), file=f)

    # Initialize the job runner
    if job_queue_mgr is None:
        runner = JobRunner("local", exception_handler=handle_exception, verbose=args.verbose >= 4)
    elif job_queue_mgr == "grid":
        strip_job_array_suffix = config_params.get("GridEngine_StripJobArraySuffix", "true").lower()
        qsub_extra_params = config_params.get("GridEngine_QsubExtraParams")
        runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4)
    else:
        strip_job_array_suffix = config_params.get("Torque_StripJobArraySuffix", "false").lower()
        qsub_extra_params = config_params.get("Torque_QsubExtraParams")
        runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4)

    progress("Step 2 - Index the reference")
    log_file = os.path.join(log_dir, "indexRef.log")
    command_line = "cfsan_snp_pipeline index_ref" + force_flag + reference_file_path
    job_id_index_ref = runner.run(command_line, "indexRef", log_file)

    progress("Step 3 - Map the sample reads to the reference")
    # Parse the user-specified aligner parameters to find the number of CPU cores requested, for example, "-p 16" or "-n 16"
    # Set the default number of CPU cores if the user did not configure a value.
    if snp_pipeline_aligner == "smalt":
        extra_params_env_var = "SmaltAlign_ExtraParams"
        threads_option = "-n"
    else:
        extra_params_env_var = "Bowtie2Align_ExtraParams"
        threads_option = "-p"

    max_processes, threads_per_process = configure_process_threads(extra_params_env_var, threads_option, 8, max_cpu_cores)

    parallel_environment = config_params.get("GridEngine_PEname", None)
    log_file = os.path.join(log_dir, "mapReads.log")
    command_line = "cfsan_snp_pipeline map_reads" + force_flag + reference_file_path + " {1} {2}"
    job_id_map_reads = runner.run_array(command_line, "mapReads", log_file, sample_full_path_names_file, max_processes=max_processes, wait_for=[job_id_index_ref], threads=threads_per_process, parallel_environment=parallel_environment)

    progress("Step 4 - Find sites with SNPs in each sample")
    if job_queue_mgr in ["grid", "torque"]:
        time.sleep(1.0 + float(sample_count) / 150) # workaround torque bug when submitting two large consecutive array jobs, potential bug for grid

    log_file = os.path.join(log_dir, "callSites.log")
    command_line = "cfsan_snp_pipeline call_sites" + force_flag + reference_file_path + " {1}"
    job_id_call_sites = runner.run_array(command_line, "callSites", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_map_reads], slot_dependency=True)

    progress("Step 5 - Filter abnormal SNP regions")
    log_file = os.path.join(log_dir, "filterRegions.log")
    extra_params = os.environ.get("FilterRegions_ExtraParams", "")
    command_line = "cfsan_snp_pipeline filter_regions" + force_flag + "-n var.flt.vcf " + sample_dirs_file + ' ' + reference_file_path + ' ' + extra_params
    job_id_filter_regions = runner.run(command_line, "filterRegions", log_file, wait_for_array=[job_id_call_sites])

    # Starting from here, there are 2 threads:
    # Thread X.1: the thread processing the original VCF files and corresponding downstream results
    # Thread X.2: the thread processing the preserved VCF files and corresponding downstream results

    progress("Step 6.1 - Merge the SNP sites across all samples into the SNP list file")
    # The mergeSites process creates the filtered list of sample directories.  It is the list of samples not having excessive snps.
    # When running on a workstation, the file exists at this point during the script execution, but on grid or torque, it has not yet been created. However,
    # we know the path to the file regardless of whether it exists yet.
    filtered_sample_dirs_file = sample_dirs_file + ".OrigVCF.filtered"
    # touch $filtered_sample_dirs_file # TODO: why was this touch here in the old run_snp_pipeline.sh script?
    log_file = os.path.join(log_dir, "mergeSites.log")
    output_file = os.path.join(work_dir, "snplist.txt")
    extra_params = os.environ.get("MergeSites_ExtraParams", "")
    command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file
    job_id_merge_sites = runner.run(command_line, "mergeSites", log_file, wait_for=[job_id_filter_regions])

    progress("Step 6.2 - Merge the SNP sites across all samples into the SNP list file")
    # Create another copy of sample directories file, for the thread processing preserved snp files.
    filtered_sample_dirs_file2 = sample_dirs_file + ".PresVCF.filtered"
    # touch $filtered_sample_dirs_file2 # TODO: why was this touch here in the old run_snp_pipeline.sh script?
    log_file = os.path.join(log_dir, "mergeSites_preserved.log")
    output_file = os.path.join(work_dir, "snplist_preserved.txt")
    extra_params = os.environ.get("MergeSites_ExtraParams", "")
    command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file2
    job_id_merge_sites2 = runner.run(command_line, "mergeSites_preserved", log_file, wait_for=[job_id_filter_regions])

    progress("Step 7.1 - Call the consensus SNPs for each sample")
    log_file = os.path.join(log_dir, "callConsensus.log")
    list_file = os.path.join(work_dir, "snplist.txt")
    output_file = "{1}/consensus.fasta"
    extra_params = os.environ.get("CallConsensus_ExtraParams", "")
    command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus.vcf {1}/reads.all.pileup"
    job_id_call_consensus = runner.run_array(command_line, "callConsensus", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites])

    progress("Step 7.2 - Call the consensus SNPs for each sample")
    log_file = os.path.join(log_dir, "callConsensus_preserved.log")
    list_file = os.path.join(work_dir, "snplist_preserved.txt")
    output_file = "{1}/consensus_preserved.fasta"
    extra_params = os.environ.get("CallConsensus_ExtraParams", "")
    command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " -e {1}/var.flt_removed.vcf --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus_preserved.vcf {1}/reads.all.pileup"
    job_id_call_consensus2 = runner.run_array(command_line, "callConsensus_preserved", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites2])

    progress("Step 8.1 - Create the SNP matrix")
    log_file = os.path.join(log_dir, "snpMatrix.log")
    output_file = os.path.join(work_dir, "snpma.fasta")
    extra_params = os.environ.get("SnpMatrix_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file
    job_id_snp_matrix = runner.run(command_line, "snpMatrix", log_file, wait_for_array=[job_id_call_consensus])

    progress("Step 8.2 - Create the SNP matrix")
    log_file = os.path.join(log_dir, "snpMatrix_preserved.log")
    output_file = os.path.join(work_dir, "snpma_preserved.fasta")
    extra_params = os.environ.get("SnpMatrix_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus_preserved.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2
    job_id_snp_matrix2 = runner.run(command_line, "snpMatrix_preserved", log_file, wait_for_array=[job_id_call_consensus2])

    progress("Step 9.1 - Create the reference sequence at SNP sites")
    log_file = os.path.join(log_dir, "snpReference.log")
    list_file = os.path.join(work_dir, "snplist.txt")
    output_file = os.path.join(work_dir, "referenceSNP.fasta")
    extra_params = os.environ.get("SnpReference_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path
    job_id_snp_reference = runner.run(command_line, "snpReference", log_file, wait_for_array=[job_id_call_consensus])

    progress("Step 9.2 - Create the reference sequence at SNP sites")
    log_file = os.path.join(log_dir, "snpReference_preserved.log")
    list_file = os.path.join(work_dir, "snplist_preserved.txt")
    output_file = os.path.join(work_dir, "referenceSNP_preserved.fasta")
    extra_params = os.environ.get("SnpReference_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path
    job_id_snp_reference2 = runner.run(command_line, "snpReference_preserved", log_file, wait_for_array=[job_id_call_consensus2])

    progress("Step 10.1 - Merge sample VCFs to create the multi-VCF file")
    if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""):
        log_file = os.path.join(log_dir, "mergeVcfs.log")
        output_file = os.path.join(work_dir, "snpma.vcf")
        extra_params = os.environ.get("MergeVcfs_ExtraParams", "")
        command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file
        job_id_merge_vcfs = runner.run(command_line, "mergeVcfs", log_file, wait_for_array=[job_id_call_consensus])
    else:
        print("Skipped per CallConsensus_ExtraParams configuration")

    progress("Step 10.2 - Merge sample VCFs to create the multi-VCF file")
    if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""):
        log_file = os.path.join(log_dir, "mergeVcfs_preserved.log")
        output_file = os.path.join(work_dir, "snpma_preserved.vcf")
        extra_params = os.environ.get("MergeVcfs_ExtraParams", "")
        command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-n consensus_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2
        job_id_merge_vcfs2 = runner.run(command_line, "mergeVcfs_preserved", log_file, wait_for_array=[job_id_call_consensus2])
    else:
        print("Skipped per CallConsensus_ExtraParams configuration")

    progress("Step 11.1 - Calculate SNP distance matrix")
    log_file = os.path.join(log_dir, "distance.log")
    input_file = os.path.join(work_dir, "snpma.fasta")
    pair_output_file = os.path.join(work_dir, "snp_distance_pairwise.tsv")
    matrix_output_file = os.path.join(work_dir, "snp_distance_matrix.tsv")
    command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file
    job_id_distance = runner.run(command_line, "distance", log_file, wait_for=[job_id_snp_matrix])

    progress("Step 11.2 - Calculate SNP distance matrix")
    log_file = os.path.join(log_dir, "distance_preserved.log")
    input_file = os.path.join(work_dir, "snpma_preserved.fasta")
    pair_output_file = os.path.join(work_dir, "snp_distance_pairwise_preserved.tsv")
    matrix_output_file = os.path.join(work_dir, "snp_distance_matrix_preserved.tsv")
    command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file
    job_id_distance2 = runner.run(command_line, "distance_preserved", log_file, wait_for=[job_id_snp_matrix2])

    progress("Step 12 - Collect metrics for each sample")
    log_file = os.path.join(log_dir, "collectMetrics.log")
    output_file = "{1}/metrics"
    extra_params = os.environ.get("CollectMetrics_ExtraParams", "")
    command_line = "cfsan_snp_pipeline collect_metrics" + force_flag + "-o " + output_file + ' ' + extra_params + " {1} " + reference_file_path
    job_id_collect_metrics = runner.run_array(command_line, "collectMetrics", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_call_consensus, job_id_call_consensus2], slot_dependency=True)

    progress("Step 13 - Combine the metrics across all samples into the metrics table")
    log_file = os.path.join(log_dir, "combineMetrics.log")
    output_file = os.path.join(work_dir, "metrics.tsv")
    extra_params = os.environ.get("CombineMetrics_ExtraParams", "")
    command_line = "cfsan_snp_pipeline combine_metrics" + force_flag + "-n metrics -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file
    combine_metrics_job_id = runner.run(command_line, "combineMetrics", log_file, wait_for_array=[job_id_collect_metrics])

    # Step 14 - Notify user of any non-fatal errors accumulated during processing
    if os.path.isfile(error_output_file) and os.path.getsize(error_output_file) > 0 and not stop_on_error:
        print("\nThere were errors processing some samples.\nSee the log file %s for a summary of errors." % error_output_file, file=sys.stderr)

    # Exit here to prevent showing the "cfsan_snp_pipeline run finished" message.  The jobs are queued, not finished yet.
    if job_queue_mgr is not None: # HPC
        sys.exit(0)
Пример #8
0
def merge_vcfs(args):
    """Merge the per-sample VCF files.

    Execute an external program (bcftools merge) to merge the VCF files.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            samples
                sample_name_one/consensus.vcf
            snpma.vcf*

    All the input files are created outside of this function.  Before
    running this command, the vcf file for each sample must be created by the
    call_consensus.py script.

    The package documentation provides an example of preparing these files based
    on the lambda_virus sequence that is used as one test for this package.

    The per-sample files are copied into a temporary directory created next to
    the merged output file, compressed with bgzip, indexed with tabix and then
    merged.  The temporary directory is removed when the function finishes,
    even when one of the external commands fails.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile : Path to file containing a list of directories -- one per sample
        vcfFileName : File name of the vcf files which must exist in each of the sample directories
        mergedVcfFile : Path to the output merged multi-vcf file
        forceFlag : Rebuild the merged file even when it is already up to date
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    sample_directories_list_path = args.sampleDirsFile
    vcf_file_name = args.vcfFileName
    merged_vcf_file = args.mergedVcfFile

    utils.verify_non_empty_input_files("File of sample directories", [sample_directories_list_path], error_handler="global")

    with open(sample_directories_list_path, "r") as f:
        sample_directories = [line.rstrip() for line in f]
    sample_directories = [d for d in sample_directories if d]
    vcf_files = [os.path.join(d, vcf_file_name) for d in sample_directories]

    # Keep only the sample vcf files that pass verification; a truthy result
    # from the verification call is treated here as "this file is unusable".
    good_vcf_files = []
    for vcf_file in vcf_files:
        bad = utils.verify_non_empty_input_files("Sample vcf file", [vcf_file], error_handler="sample", continue_possible=True)
        if not bad:
            good_vcf_files.append(vcf_file)

    if not good_vcf_files:
        utils.global_error("There are no vcf files to merge.")

    #==========================================================================
    # Check if merge has already been done
    #==========================================================================
    needs_rebuild = utils.target_needs_rebuild(vcf_files, merged_vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Multi-VCF file is already freshly created.  Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Copy, Compress, Index, Merge
    #==========================================================================

    # If there is only one good sample, just copy the consensus VCF file to the snpma.vcf file
    if len(good_vcf_files) == 1:
        shutil.copy(good_vcf_files[0], merged_vcf_file)
        return

    # Copy single VCF files to a common directory where the files will be edited
    verbose_print("# %s Copying VCF files to temp directory" % utils.timestamp())
    parent_of_temp_dir = os.path.dirname(merged_vcf_file)
    temp_dir = tempfile.mkdtemp(prefix="tmp.vcf.", dir=parent_of_temp_dir)
    try:
        good_vcf_set = set(good_vcf_files)  # constant-time membership tests
        file_copies = []
        for d in sample_directories:
            src_file = os.path.join(d, vcf_file_name)
            if src_file in good_vcf_set:
                # Normalize first so a trailing path separator does not yield
                # an empty sample name (every copy would collide on ".vcf").
                sample_name = os.path.basename(os.path.normpath(d))
                dst_file = os.path.join(temp_dir, sample_name + ".vcf")
                file_copies.append(dst_file)
                verbose_print("copy %s %s" % (src_file, dst_file))
                shutil.copy2(src_file, dst_file)

        # bgzip all the sample vcf files
        verbose_print("# %s Compressing VCF files" % utils.timestamp())
        for vcf_copy in file_copies:
            verbose_print("bgzip -c %s > %s" % (vcf_copy, vcf_copy + ".gz"))
            command.run("bgzip -c " + vcf_copy, vcf_copy + ".gz")

        # Index all the zipped sample vcf file
        verbose_print("# %s Indexing VCF files" % utils.timestamp())
        for vcf_copy in file_copies:
            zipped_copy = vcf_copy + ".gz"
            verbose_print("tabix -f -p vcf " + zipped_copy)
            command.run("tabix -f -p vcf " + zipped_copy, sys.stdout)

        # Substitute the default parameters if the user did not specify bcftools parameters
        default_params = "--merge all --info-rules NS:sum"
        bcf_tools_extra_params = os.environ.get("BcftoolsMerge_ExtraParams") or default_params

        # Merge the VCFs
        verbose_print("# %s Merging VCF files" % utils.timestamp())
        command_line = "bcftools merge -o " + merged_vcf_file + ' ' + bcf_tools_extra_params + ' ' + temp_dir + "/*.gz"
        verbose_print(command_line)
        command.run(command_line, sys.stdout)
    finally:
        # Clean up, even if one of the external commands failed
        shutil.rmtree(temp_dir)
Пример #9
0
def index_ref(args):
    """Build the aligner index and the samtools faidx index for the reference.

    An external aligner (bowtie2 or smalt) is executed to index the reference
    genome for later alignment, then samtools is executed to create the faidx
    index file used by later pileups.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and
    smalt; bowtie2 is used when the variable is unset or empty.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta         # input fasta
                referenceFile.#.bt2*        # bowtie2 output
                referenceFile.rev.#.bt2*    # bowtie2 output
                referenceFile.sma*          # smalt output
                referenceFile.smi*          # smalt output
                referenceFile.fasta.fai*    # samtools faidx output

    The input fasta file is created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        forceFlag : Rebuild the indexes even when they are already up to date
    """
    utils.print_log_header()
    utils.print_arguments(args)

    # ---- Validate inputs ----------------------------------------------------

    # The reference fasta must exist and must not be empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    # Reference path with the file extension stripped
    reference_base_path, _unused_ext = os.path.splitext(reference_file_path)

    # Pick the aligner named by SnpPipeline_Aligner, defaulting to bowtie2
    snp_pipeline_aligner = (os.environ.get("SnpPipeline_Aligner") or "bowtie2").lower()
    if snp_pipeline_aligner not in ("bowtie2", "smalt"):
        utils.global_error("Error: only bowtie2 and smalt aligners are supported.")

    # ---- Aligner index ------------------------------------------------------

    if snp_pipeline_aligner == "bowtie2":
        bowtie_index_file = reference_base_path + ".rev.1.bt2"
        index_is_stale = utils.target_needs_rebuild([reference_file_path], bowtie_index_file)
        if args.forceFlag or index_is_stale:
            version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")
            build_extra_params = os.environ.get("Bowtie2Build_ExtraParams") or ""
            command_line = "bowtie2-build %s %s %s" % (build_extra_params, reference_file_path, reference_base_path)
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
        else:
            verbose_print("# Bowtie index %s is already freshly built.  Use the -f option to force a rebuild." % bowtie_index_file)

    elif snp_pipeline_aligner == "smalt":
        smalt_index_file = reference_base_path + ".smi"
        index_is_stale = utils.target_needs_rebuild([reference_file_path], smalt_index_file)
        if args.forceFlag or index_is_stale:
            version_str = utils.extract_version_str("smalt", "smalt version")
            index_extra_params = os.environ.get("SmaltIndex_ExtraParams") or ""
            # Note the argument order: smalt takes the index base name before the fasta
            command_line = "smalt index %s %s %s" % (index_extra_params, reference_base_path, reference_file_path)
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, sys.stdout)
        else:
            verbose_print("# Smalt index %s is already freshly built.  Use the -f option to force a rebuild." % smalt_index_file)

    # ---- samtools fai index -------------------------------------------------

    verbose_print("")
    fai_file = reference_file_path + ".fai"
    fai_is_stale = utils.target_needs_rebuild([reference_file_path], fai_file)
    if args.forceFlag or fai_is_stale:
        version_str = utils.extract_version_str("samtools", "samtools 2>&1 > /dev/null")
        faidx_extra_params = os.environ.get("SamtoolsFaidx_ExtraParams") or ""
        command_line = "samtools faidx %s %s" % (faidx_extra_params, reference_file_path)
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.global_error_on_missing_file(fai_file, "samtools faidx")
    else:
        verbose_print("# SAMtools fai index %s is already freshly built.  Use the -f option to force a rebuild." % fai_file)
Пример #10
0
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*

    The reverse fastq file is optional.
    The fastq files may be either compressed with gzip or uncompressed.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
        forceFlag : Realign even when reads.sam is already up to date
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file",
                                       fastq_files,
                                       error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error(
            "Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    # strip the file extension
    reference_base_path = os.path.splitext(reference_file_path)[0]
    reference_id = os.path.basename(reference_base_path)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    # The sam file depends on the fastq files plus the aligner's index file
    source_files = list(fastq_files)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# %s has already been aligned to %s.  Use the -f option to force a rebuild."
            % (sample_id, reference_id))
        return

    #==========================================================================
    # Construct the command line to execute bowtie2 or smalt
    #==========================================================================

    # The read group identifies reads from a single run and lane
    read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1,
                                                      sample_id)

    # Default to 8 cores on HPC or all cpu cores on workstation
    if os.environ.get("JOB_ID") or os.environ.get("PBS_JOBID"):
        num_cores = 8
    else:
        # cpu_count() returns None when the count cannot be determined;
        # fall back to a single core rather than emitting "-p None".
        num_cores = psutil.cpu_count() or 1

    num_cores_param = ""

    if snp_pipeline_aligner == "bowtie2":
        version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")

        # Parse the user-specified bowtie parameters to determine if the user specified the number of CPU cores
        bowtie2_align_extra_params = os.environ.get(
            "Bowtie2Align_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(
                bowtie2_align_extra_params, "-p"):
            num_cores_param = "-p " + str(num_cores)

        # Specify the read group and sample tags here, --rg tags cannot be specified without ID.
        # The read group tags are used by some downstream tools, like Picard and GATK.
        read_group_params = ""
        if read_group_tags:
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

        # Substitute the default parameters if the user did not specify bowtie parameters
        bowtie2_align_params = bowtie2_align_extra_params or "--reorder -q"

        # Build the command with options depending on whether the fastq files are paired
        command_line = "bowtie2 " + num_cores_param + " " + read_group_params + " " + bowtie2_align_params + " -x " + reference_base_path
        if sample_fastq_file2:
            command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
        else:
            command_line += " -U " + sample_fastq_file1

    elif snp_pipeline_aligner == "smalt":
        version_str = utils.extract_version_str("smalt", "smalt version")

        # Parse the user-specified smalt parameters to determine if the user specified the number of CPU cores
        smalt_align_extra_params = os.environ.get(
            "SmaltAlign_ExtraParams") or ""
        if not utils.detect_numeric_option_in_parameters_str(
                smalt_align_extra_params, "-n"):
            num_cores_param = "-n " + str(num_cores)

        # Substitute the default parameters if the user did not specify smalt parameters
        smalt_align_params = smalt_align_extra_params or "-O"

        # Don't use the -i 1000 option if the fastq file is unpaired.
        # Strip it from the effective parameters (not the raw user string) so
        # the default parameters survive when the user specified none.
        if not sample_fastq_file2:
            smalt_align_params = re.sub(r"-i[ ]+[0-9]+", '', smalt_align_params)

        command_line = "smalt map " + num_cores_param + " " + smalt_align_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (
            sample_fastq_file2 or "")

    #==========================================================================
    # Run the command to execute bowtie2 or smalt
    #==========================================================================
    verbose_print("# Align sequence %s to reference %s" %
                  (sample_id, reference_id))
    verbose_print("# %s %s" % (utils.timestamp(), command_line))
    verbose_print("# %s" % version_str)
    command.run(command_line, sam_file)

    #==========================================================================
    # When using smalt, assign read groups in a separate step.
    # This is already done when using bowtie2.
    #==========================================================================
    if snp_pipeline_aligner == "smalt" and read_group_tags:
        # Move the smalt output aside; Picard rewrites it to reads.sam with read groups
        smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
        shutil.move(sam_file, smalt_sam_file)
        version_str = utils.extract_version_str(
            "Picard",
            "java  picard.cmdline.PicardCommandLine AddOrReplaceReadGroups --version 2>&1"
        )
        jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
        command_line = "java " + jvm_params + " picard.cmdline.PicardCommandLine AddOrReplaceReadGroups"
        command_line += " I=" + smalt_sam_file
        command_line += " O=" + sam_file
        command_line += " RGID=" + read_group_tags.ID
        command_line += " RGSM=" + read_group_tags.SM
        command_line += " RGLB=" + read_group_tags.LB
        command_line += " RGPL=" + read_group_tags.PL
        command_line += " RGPU=" + read_group_tags.PU
        verbose_print("")
        verbose_print("# Assign read group id %s" % (read_group_tags.ID))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
Пример #11
0
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is generated, and
    snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Each stage is skipped when its output is already newer than its inputs,
    unless the force flag is set.  Duplicate-read removal is skipped entirely
    when the RemoveDuplicateReads environment variable is set to anything
    other than "true" (case-insensitive).

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        forceFlag : Rebuild every stage even when its output is up to date
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get("SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher.
        # Compare numerically: as strings, "1.10" would sort before "1.3".
        # An unparseable version yields an empty tuple and uses the -o syntax.
        samtools_version = _samtools_version_tuple(version_str.split()[-1])
        if samtools_version and samtools_version < (1, 3):
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file], deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            # Picard is launched through java, so it must be on the CLASSPATH
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error("Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable.")
            else:
                version_str = utils.extract_version_str("Picard", "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1")
                picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file, "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild([pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# VCF file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        # VarScan is launched through java, so it must be on the CLASSPATH
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable.")
        else:
            version_str = utils.extract_version_str("VarScan", "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            varscan_jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            # The command output is captured in the vcf file, so scan it for known failure text
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")


def _samtools_version_tuple(version_token):
    """Convert a version token such as "1.10" or "0.1.19-44428cd" to a tuple of ints.

    Parsing stops at the first dot-separated component that does not start
    with a digit, or right after a component carrying a non-numeric suffix.
    An empty tuple is returned when the token does not begin with a number.
    """
    numbers = []
    for piece in version_token.split("."):
        digits = ""
        for char in piece:
            if char not in "0123456789":
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
        if len(digits) != len(piece):
            break
    return tuple(numbers)
Пример #12
0
def _reusable_snp_count(metrics, key):
    """Return the SNP count stored under `key` in `metrics` as an int.

    Returns None when the value is absent, empty, or not an integer, which
    tells the caller that the count must be recomputed.
    """
    try:
        return int(metrics.get(key, ""))
    except (TypeError, ValueError):
        return None


def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    Each metric is derived from one input file in the sample directory.
    When the metrics file already exists and is fresh relative to that input
    file (and the force flag is not set), the previously stored value is
    reused instead of being recomputed.  Problems are reported through
    handle_error() and the corresponding metric is written as an empty value.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    The RemoveDuplicateReads environment variable (default "true") controls
    whether the number of duplicate reads is collected.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        forceFlag : When true, never reuse values from an existing metrics file
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file.  Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory",
                                     sample_dir,
                                     error_handler="sample",
                                     continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        # No readable metrics file yet -- everything will be computed
        metrics = dict()

    #-------------------------
    verbose_print(
        "# %s %s" %
        (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files
                   if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    if fastq_files:
        fastq_file_size = sum(
            [os.path.getsize(fastq_path) for fastq_path in fastq_files])

    # Make a comma separated list of just the fastq file names without directories
    fastq_file_list = ", ".join(
        [os.path.basename(fastq_path) for fastq_path in fastq_files])

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(
    ), "Calculate number of reads, %mapped, %proper pair, and ave insert size from sam file"
                               ))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    percent_proper_pair = ""
    ave_insert_size = ""
    sam_path = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", sam_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([sam_path],
                                                   metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads",
                                    "")  # reuse already fresh metrics
            percent_reads_mapped = metrics.get(
                "percentReadsMapped", "")  # reuse already fresh metrics
            percent_proper_pair = metrics.get(
                "percentProperPair", "")  # reuse already fresh metrics
            ave_insert_size = metrics.get("aveInsertSize",
                                          "")  # reuse already fresh metrics
        missing_any_metrics = not all([
            num_reads, percent_reads_mapped, percent_proper_pair,
            ave_insert_size
        ])
        if not missing_any_metrics:
            verbose_print(
                "Reusing previously calculated number of reads, %mapped, %proper pair, and ave insert size"
            )
        else:
            # Any one missing value causes all four to be recomputed
            tempfile_path = os.path.join(sample_dir, "tmp.sam.stats")
            try:
                command.run("samtools stats " + sam_path, tempfile_path)
            except subprocess.CalledProcessError:
                pass  # the error message has already been printed to stderr
            with open(tempfile_path) as f:
                for line in f:
                    lower_line = line.lower()
                    split_line = line.strip().split('\t')
                    if "raw total sequences:" in lower_line:
                        num_reads = split_line[2]
                        continue
                    if "reads mapped:" in lower_line:
                        reads_mapped = split_line[2]
                        try:
                            percent_reads_mapped = 100.0 * float(
                                reads_mapped) / float(num_reads)
                            percent_reads_mapped = "%.2f" % percent_reads_mapped
                        except (ValueError, ZeroDivisionError):
                            # Unparsable counts, or zero total reads: leave
                            # the metric empty so it is reported as missing
                            percent_reads_mapped = ""
                        continue
                    if "reads properly paired:" in lower_line:
                        proper_pairs = split_line[2]
                        try:
                            percent_proper_pair = 100.0 * float(
                                proper_pairs) / float(num_reads)
                            percent_proper_pair = "%.2f" % percent_proper_pair
                        except (ValueError, ZeroDivisionError):
                            # Unparsable counts, or zero total reads: leave
                            # the metric empty so it is reported as missing
                            percent_proper_pair = ""
                        continue
                    if "insert size average:" in lower_line:
                        ave_insert_size = split_line[2]
                        continue
            os.unlink(tempfile_path)
            missing_any_metrics = not all([
                num_reads, percent_reads_mapped, percent_proper_pair,
                ave_insert_size
            ])
            if missing_any_metrics:
                missing_list = []
                if not num_reads:
                    missing_list.append("number of reads")
                if not percent_reads_mapped:
                    missing_list.append("percent reads mapped")
                if not percent_proper_pair:
                    missing_list.append("percent proper pair")
                if not ave_insert_size:
                    missing_list.append("ave insert size")
                error_text = "Cannot calculate " + ", ".join(
                    missing_list) + '.'
                handle_error(error_text)

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print(
            "# %s %s" %
            (utils.timestamp(),
             "Calculate number of duplicate reads from deduped bam file"))
        deduped_bam_path = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", deduped_bam_path):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([deduped_bam_path],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads",
                                            "")  # reuse already fresh metrics
            if num_dup_reads:
                verbose_print(
                    "Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " +
                                            deduped_bam_path)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    pileup_path = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", pileup_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([pileup_path],
                                                   metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth",
                                           "")  # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            # Sum the 4th whitespace-separated column of every pileup line,
            # silently skipping lines where it is absent or not an integer
            depth_sum = 0
            with open(pileup_path) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            # The mean is taken over the whole reference length, not just
            # over the positions present in the pileup
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(
    ), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    vcf_path = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", vcf_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([vcf_path],
                                                   metrics_file_path)
        reused_count = None
        if not args.forceFlag and not needs_rebuild:
            # Values read back from the metrics file are text; convert to int
            # so the comparison against maxSnps below is numeric
            reused_count = _reusable_snp_count(metrics, "phase1Snps")
        if reused_count is not None:
            verbose_print("Reusing previously calculated phase1 snps")
            phase1_snps = reused_count
        else:
            phase1_snps = count_vcf_file_snps(vcf_path)

        # Flag excessive snps
        if max_allowed_snps > 0 and phase1_snps > max_allowed_snps:
            excluded_sample = "Excluded"
            handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(
    ), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"
                               ))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    preserved_vcf_path = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", preserved_vcf_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([preserved_vcf_path],
                                                   metrics_file_path)
        reused_count = None
        if not args.forceFlag and not needs_rebuild:
            # Values read back from the metrics file are text; convert to int
            # so the comparison against maxSnps below is numeric
            reused_count = _reusable_snp_count(metrics, "phase1SnpsPreserved")
        if reused_count is not None:
            verbose_print(
                "Reusing previously calculated preserved phase1 snps")
            phase1_snps_preserved = reused_count
        else:
            phase1_snps_preserved = count_vcf_file_snps(preserved_vcf_path)

        # Flag excessive snps
        if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps:
            excluded_sample_preserved = "Excluded"
            handle_error("Excluded: preserved exceeded %i maxsnps." %
                         max_allowed_snps)
        phase1_snps_preserved = str(phase1_snps_preserved)

    #-------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(),
                   "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    consensus_vcf_path = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", consensus_vcf_path):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([consensus_vcf_path],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps",
                                          "")  # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(consensus_vcf_path)
                phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print(
        "# %s %s" %
        (utils.timestamp(),
         "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    consensus_preserved_vcf_path = os.path.join(
        sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", consensus_preserved_vcf_path):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild(
                [consensus_preserved_vcf_path], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get(
                    "snpsPreserved", "")  # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print(
                    "Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(
                    consensus_preserved_vcf_path)
                phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print(
        "# %s %s" %
        (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    consensus_fasta_path = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", consensus_fasta_path):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([consensus_fasta_path],
                                                       metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos",
                                          "")  # reuse already fresh metrics
            if missing_pos:
                verbose_print(
                    "Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(
                    consensus_fasta_path, sample_id)
                missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" %
                  (utils.timestamp(),
                   "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    consensus_preserved_fasta_path = os.path.join(
        sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file",
                         consensus_preserved_fasta_path):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild(
                [consensus_preserved_fasta_path], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get(
                    "missingPosPreserved", "")  # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print(
                    "Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(
                    consensus_preserved_fasta_path, sample_id)
                missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    # One key=value line per metric; metrics that could not be collected are
    # written with an empty value.
    # NOTE(review): error_list is a module-level name, presumably filled by
    # handle_error() -- confirm against its definition.
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("percentProperPair=" + percent_proper_pair, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
Пример #13
0
def call_sites(args):
    """Find the sites with SNPs in a sample.

    Two steps are performed, each skipped when its output is already fresh
    relative to its inputs and the force flag is not set:
    a pileup is generated from the sample BAM file with samtools mpileup,
    then snps are called from the pileup with VarScan mpileup2snp.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sorted.deduped.indelrealigned.bam
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The name of the input BAM file starts from reads.sorted.bam; the
    ".deduped" and ".indelrealigned" suffixes are applied only when the
    RemoveDuplicateReads and EnableLocalRealignment environment variables
    (both defaulting to "true") are enabled.

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        forceFlag : When true, rebuild the outputs even if they are fresh
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # The reference fasta file must exist and must not be empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir

    # Optional upstream steps determine which BAM file is the pileup input
    dedup_enabled = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
    realign_enabled = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    input_bam_file = utils.add_file_suffix(os.path.join(sample_dir, "reads.sorted.bam"), ".deduped", enable=dedup_enabled)
    input_bam_file = utils.add_file_suffix(input_bam_file, ".indelrealigned", enable=realign_enabled)

    utils.verify_non_empty_input_files("Sample BAM file", [input_bam_file], error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    pileup_is_stale = utils.target_needs_rebuild([input_bam_file, reference_file_path], pileup_file)
    if args.forceFlag or pileup_is_stale:
        samtools_version = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
        mpileup_extra_params = os.environ.get("SamtoolsMpileup_ExtraParams") or ""
        mpileup_command = "".join([
            "samtools mpileup ",
            mpileup_extra_params,
            " -f ",
            reference_file_path,
            ' ',
            input_bam_file,
        ])
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), mpileup_command))
        verbose_print("# %s" % samtools_version)
        command.run(mpileup_command, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")
    else:
        verbose_print("# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    vcf_is_stale = utils.target_needs_rebuild([pileup_file], vcf_file)
    if args.forceFlag or vcf_is_stale:
        # VarScan is located through the CLASSPATH environment variable
        jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
        if jar_file_path:
            varscan_version = utils.extract_version_str("VarScan", "java -jar " + jar_file_path + " 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2")
            jvm_extra_params = os.environ.get("VarscanJvm_ExtraParams") or ""
            mpileup2snp_extra_params = os.environ.get("VarscanMpileup2snp_ExtraParams") or ""
            varscan_command = "".join([
                "java ",
                jvm_extra_params,
                " -jar ",
                jar_file_path,
                " mpileup2snp ",
                pileup_file,
                " --output-vcf 1 ",
                mpileup2snp_extra_params,
            ])
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), varscan_command))
            verbose_print("# %s" % varscan_version)
            command.run(varscan_command, vcf_file)
            # Flag a missing output file, and known failure text captured in it
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError", "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient", "VarScan")
        else:
            utils.global_error("Error: cannot execute VarScan. Define the path to VarScan.jar in the CLASSPATH environment variable.")
    else:
        verbose_print("# VCF file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
Пример #14
0
def call_sites(args):
    """Find the sites with SNPs in a sample.

    The sample alignment is sorted, duplicate reads are removed, a pileup is generated, and
    snps are called.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/reads.sam
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.all.pileup*
                sample_name_one/var.flt.vcf*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    sample_dir = args.sampleDir
    sam_file = os.path.join(sample_dir, "reads.sam")
    utils.verify_non_empty_input_files("Sample SAM file", [sam_file],
                                       error_handler="sample")

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")

        # Substitute the default parameters if the user did not specify samtools view parameters
        samtools_samfilter_params = os.environ.get(
            "SamtoolsSamFilter_ExtraParams") or "-F 4"
        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print(
            "# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file],
                                               sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_sort_extra_params = os.environ.get(
            "SamtoolsSort_ExtraParams") or ""

        # Inspect the samtools version to determine how to execute samtools
        # Use the -o FILE command line option with SAMtools 1.3 and higher
        samtools_version = version_str.split()[-1]  # just the number
        if samtools_version < "1.3":
            command_line = "samtools sort " + samtools_sort_extra_params + ' ' + unsorted_bam_file + ' ' + os.path.join(
                sample_dir, "reads.sorted")
        else:
            command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file

        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        # Check for fresh deduped bam file; if not, remove duplicate reads
        deduped_bam_file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        needs_rebuild = utils.target_needs_rebuild([sorted_bam_file],
                                                   deduped_bam_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            classpath = os.environ.get("CLASSPATH")
            if not classpath or "picard" not in classpath.lower():
                utils.global_error(
                    "Error: cannot execute Picard. Define the path to Picard in the CLASSPATH environment variable."
                )
            else:
                version_str = utils.extract_version_str(
                    "Picard",
                    "java picard.cmdline.PicardCommandLine MarkDuplicates --version 2>&1"
                )
                picard_jvm_extra_params = os.environ.get(
                    "PicardJvm_ExtraParams") or ""
                picard_mark_duplicates_extra_params = os.environ.get(
                    "PicardMarkDuplicates_ExtraParams") or ""
                tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
                tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
                command_line = "java " + picard_jvm_extra_params + ' ' + "picard.cmdline.PicardCommandLine MarkDuplicates INPUT=" + sorted_bam_file + " OUTPUT=" + deduped_bam_file + " METRICS_FILE=" + os.path.join(
                    sample_dir, "duplicate_reads_metrics.txt"
                ) + tmp_option + ' ' + picard_mark_duplicates_extra_params
                verbose_print("# Remove duplicate reads from bam file.")
                verbose_print("# %s %s" % (utils.timestamp(), command_line))
                verbose_print("# %s" % version_str)
                command.run(command_line, sys.stdout)
                utils.sample_error_on_missing_file(deduped_bam_file,
                                                   "picard MarkDuplicates")
                verbose_print("")
        pileup_input_file = deduped_bam_file
    else:
        pileup_input_file = sorted_bam_file

    #==========================================================================
    # Create the pileup file
    #==========================================================================

    # Check for fresh pileup; if not, create it
    pileup_file = os.path.join(sample_dir, "reads.all.pileup")
    needs_rebuild = utils.target_needs_rebuild(
        [pileup_input_file, reference_file_path], pileup_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Pileup file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_mpileup_extra_params = os.environ.get(
            "SamtoolsMpileup_ExtraParams") or ""
        command_line = "samtools mpileup " + samtools_mpileup_extra_params + " -f " + reference_file_path + ' ' + pileup_input_file
        verbose_print("# Create pileup from bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, pileup_file)
        utils.sample_error_on_missing_file(pileup_file, "samtools mpileup")
        verbose_print("")

    #==========================================================================
    # Find the sites with SNPs
    #==========================================================================

    # Check for fresh unfiltered vcf; if not, create it
    vcf_file = os.path.join(sample_dir, "var.flt.vcf")
    needs_rebuild = utils.target_needs_rebuild([pileup_file], vcf_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# VCF file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        classpath = os.environ.get("CLASSPATH")
        if not classpath or "varscan" not in classpath.lower():
            utils.global_error(
                "Error: cannot execute VarScan. Define the path to VarScan in the CLASSPATH environment variable."
            )
        else:
            version_str = utils.extract_version_str(
                "VarScan",
                "java net.sf.varscan.VarScan 2>&1 > /dev/null | head -n 1 | cut -d ' ' -f 2"
            )
            varscan_jvm_extra_params = os.environ.get(
                "VarscanJvm_ExtraParams") or ""
            varscan_mpileup2snp_extra_params = os.environ.get(
                "VarscanMpileup2snp_ExtraParams") or ""
            command_line = "java " + varscan_jvm_extra_params + " net.sf.varscan.VarScan mpileup2snp " + pileup_file + " --output-vcf 1 " + varscan_mpileup2snp_extra_params
            verbose_print("# Create vcf file")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % version_str)
            command.run(command_line, vcf_file)
            utils.sample_error_on_missing_file(vcf_file, "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "OutOfMemoryError",
                                                "VarScan")
            utils.sample_error_on_file_contains(vcf_file, "Insufficient",
                                                "VarScan")
Пример #15
0
def run(args):
    """Run all the steps of the snp pipeline in the correct order.

    Parameters
    ----------
    args : Namespace
        referenceFile : str
            Relative or absolute path to the reference fasta file
        forceFlag : bool
            Force processing even when result files already exist and are newer than inputs
        mirror : str
            Mode to create a mirror copy of the reference directory and all the sample directories.
            Possible values: {soft, hard, copy}
        configFile : str
            Relative or absolute path to a configuration file for overriding defaults and defining
            extra parameters for the tools and scripts within the pipeline.
        jobQueueMgr : str
            Job queue manager for remote parallel job execution in an HPC environment.  Currently
            "torque" and "grid" are supported.  If not specified, the pipeline will execute locally.
        workDir : str
            Output directory for the result files.
        samplesDir : str
            Relative or absolute path to the parent directory of all the sample directories.
        samplesFile : str
            Relative or absolute path to a file listing all of the sample directories.
        purge : bool
            Purge the intermediate output files when the pipeline completes successfully.
    """
    global log_dir
    global job_queue_mgr

    start_time = time.time()
    # Where are we running: grid, torque, or None (local)
    job_queue_mgr = args.jobQueueMgr

    # Erase any left-over error log environment variable from a previous run
    os.environ.pop("errorOutputFile",
                   None)  # the 2nd arg avoids an exception when not in dict

    # Handle output working directory.  Create the directory if it does not exist.
    # Any errors creating the work_dir will not be logged to the error log because
    # the error log belongs in the work_dir.
    work_dir = args.workDir
    try:
        utils.mkdir_p(work_dir)
    except OSError as exc:
        utils.fatal_error("Error: could not create the output directory %s" %
                          work_dir)
    if not utils.is_directory_writeable(work_dir):
        utils.fatal_error("Error: output directory % is not writable." %
                          work_dir)

    # The error log is in the main workdir
    error_output_file = os.path.join(work_dir, "error.log")
    os.environ["errorOutputFile"] = error_output_file
    # TODO: copy old error log to old logs directory, because otherwise it will be removed and lost forever
    if os.path.isfile(error_output_file):
        os.remove(error_output_file)

    # Validate reference fasta file
    reference_file_path = args.referenceFile
    if not os.path.isfile(reference_file_path):
        utils.fatal_error("Error: reference file %s does not exist." %
                          reference_file_path)
    if os.path.getsize(reference_file_path) == 0:
        utils.fatal_error("Error: reference file %s is empty." %
                          reference_file_path)
    reference_file_name = os.path.basename(reference_file_path)

    # Force rebuild flag is passed to all the subtask commands below
    force_flag = " -f " if args.forceFlag else " "

    # Create the logs directory with name like "logs-20170215.144253"
    run_time_stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
    log_dir = os.path.join(work_dir, "logs-" + run_time_stamp)
    try:
        utils.mkdir_p(log_dir)
    except OSError as exc:
        utils.fatal_error("Error: could not create the logs directory %s" %
                          log_dir)
    if not utils.is_directory_writeable(work_dir):
        utils.fatal_error("Error: logs directory % is not writable." % log_dir)

    # Handle configuration file, use the specified file, or create a default file
    if args.configFile:
        config_file_path = args.configFile
        if not os.path.isfile(config_file_path):
            utils.fatal_error("Error: configuration file %s does not exist." %
                              config_file_path)
        if os.path.getsize(config_file_path) == 0:
            utils.fatal_error("Error: configuration file %s is empty." %
                              config_file_path)

        shutil.copy2(config_file_path,
                     log_dir)  # copy2 tries to preserve timestamps
        config_params = utils.read_properties(config_file_path,
                                              recognize_vars=True)
        validate_properties(config_params)
    else:
        command.run("cfsan_snp_pipeline data configurationFile " + log_dir,
                    outfile=sys.stdout)
        config_file_path = os.path.join(log_dir, "snppipeline.conf")
        config_params = utils.read_properties(config_file_path,
                                              recognize_vars=True)

    # Validate the configured aligner choice
    snp_pipeline_aligner = config_params.get("SnpPipeline_Aligner",
                                             "").lower() or "bowtie2"
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.fatal_error(
            "Config file error in SnpPipeline_Aligner parameter: only bowtie2 and smalt aligners are supported."
        )
    os.environ["SnpPipeline_Aligner"] = snp_pipeline_aligner

    # Stop the pipeline by default upon single sample errors if not configured either way
    # The environment variable is used by called processes
    stop_on_error = config_params.get("StopOnSampleError",
                                      "").lower() or "true"
    os.environ["StopOnSampleError"] = stop_on_error

    # Convert the stop_on_error flag to boolean for internal use in this function
    stop_on_error = stop_on_error == "true"

    # How many CPU cores can we use?
    max_cpu_cores = config_params.get("MaxCpuCores", None)
    if max_cpu_cores == "":
        max_cpu_cores = None
    if max_cpu_cores:
        try:
            max_cpu_cores = int(max_cpu_cores)
            if max_cpu_cores < 1:
                utils.fatal_error(
                    "Config file error in MaxCpuCores parameter: %s is less than one."
                    % max_cpu_cores)
        except ValueError:
            utils.fatal_error(
                "Config file error in MaxCpuCores parameter: %s is not a valid number."
                % max_cpu_cores)

    if job_queue_mgr is None:  # workstation
        num_local_cpu_cores = psutil.cpu_count()
        max_cpu_cores = min(
            num_local_cpu_cores,
            max_cpu_cores) if max_cpu_cores else num_local_cpu_cores

    # How many CPU cores per process?
    if job_queue_mgr is None:  # workstation
        cpu_cores_per_process = config_params.get(
            "CpuCoresPerProcessOnWorkstation", None)
        if cpu_cores_per_process:
            try:
                cpu_cores_per_process = int(cpu_cores_per_process)
                if cpu_cores_per_process < 1:
                    utils.fatal_error(
                        "Config file error in CpuCoresPerProcessOnWorkstation parameter: %s is less than one."
                        % cpu_cores_per_process)
            except ValueError:
                utils.fatal_error(
                    "Config file error in CpuCoresPerProcessOnWorkstation parameter: %s is not a valid number."
                    % cpu_cores_per_process)
        else:
            cpu_cores_per_process = min(num_local_cpu_cores, max_cpu_cores)
    else:  # HPC
        cpu_cores_per_process = config_params.get("CpuCoresPerProcessOnHPC",
                                                  None)
        if not cpu_cores_per_process:
            utils.fatal_error(
                "Config file error. CpuCoresPerProcessOnHPC parameter must be set to a value."
            )
        else:
            try:
                cpu_cores_per_process = int(cpu_cores_per_process)
                if cpu_cores_per_process < 1:
                    utils.fatal_error(
                        "Config file error in CpuCoresPerProcessOnHPC parameter: %s is less than one."
                        % cpu_cores_per_process)
            except ValueError:
                utils.fatal_error(
                    "Config file error in CpuCoresPerProcessOnHPC parameter: %s is not a valid number."
                    % cpu_cores_per_process)

    # Put the configuration parameters into the process environment variables
    os.environ["Bowtie2Build_ExtraParams"] = config_params.get(
        "Bowtie2Build_ExtraParams", "")
    os.environ["SmaltIndex_ExtraParams"] = config_params.get(
        "SmaltIndex_ExtraParams", "")
    os.environ["CreateSequenceDictionary_ExtraParams"] = config_params.get(
        "CreateSequenceDictionary_ExtraParams", "")
    os.environ["SamtoolsFaidx_ExtraParams"] = config_params.get(
        "SamtoolsFaidx_ExtraParams", "")
    os.environ["Bowtie2Align_ExtraParams"] = config_params.get(
        "Bowtie2Align_ExtraParams", "")
    os.environ["SmaltAlign_ExtraParams"] = config_params.get(
        "SmaltAlign_ExtraParams", "")
    os.environ["SamtoolsSamFilter_ExtraParams"] = config_params.get(
        "SamtoolsSamFilter_ExtraParams", "")
    os.environ["SamtoolsSort_ExtraParams"] = config_params.get(
        "SamtoolsSort_ExtraParams", "")
    os.environ["SamtoolsIndex_ExtraParams"] = config_params.get(
        "SamtoolsIndex_ExtraParams", "")
    os.environ["RemoveDuplicateReads"] = config_params.get(
        "RemoveDuplicateReads", "").lower() or "true"
    os.environ["PicardJvm_ExtraParams"] = config_params.get(
        "PicardJvm_ExtraParams", "")
    os.environ["PicardMarkDuplicates_ExtraParams"] = config_params.get(
        "PicardMarkDuplicates_ExtraParams", "")
    os.environ["EnableLocalRealignment"] = config_params.get(
        "EnableLocalRealignment", "").lower() or "true"
    os.environ["GatkJvm_ExtraParams"] = config_params.get(
        "GatkJvm_ExtraParams", "")
    os.environ["RealignerTargetCreator_ExtraParams"] = config_params.get(
        "RealignerTargetCreator_ExtraParams", "")
    os.environ["IndelRealigner_ExtraParams"] = config_params.get(
        "IndelRealigner_ExtraParams", "")
    os.environ["SamtoolsMpileup_ExtraParams"] = config_params.get(
        "SamtoolsMpileup_ExtraParams", "")
    os.environ["VarscanMpileup2snp_ExtraParams"] = config_params.get(
        "VarscanMpileup2snp_ExtraParams", "")
    os.environ["VarscanJvm_ExtraParams"] = config_params.get(
        "VarscanJvm_ExtraParams", "")
    os.environ["FilterRegions_ExtraParams"] = config_params.get(
        "FilterRegions_ExtraParams", "")
    os.environ["MergeSites_ExtraParams"] = config_params.get(
        "MergeSites_ExtraParams", "")
    os.environ["CallConsensus_ExtraParams"] = config_params.get(
        "CallConsensus_ExtraParams", "")
    os.environ["SnpMatrix_ExtraParams"] = config_params.get(
        "SnpMatrix_ExtraParams", "")
    os.environ["BcftoolsMerge_ExtraParams"] = config_params.get(
        "BcftoolsMerge_ExtraParams", "")
    os.environ["SnpReference_ExtraParams"] = config_params.get(
        "SnpReference_ExtraParams", "")
    os.environ["MergeVcfs_ExtraParams"] = config_params.get(
        "MergeVcfs_ExtraParams", "")
    os.environ["CollectMetrics_ExtraParams"] = config_params.get(
        "CollectMetrics_ExtraParams", "")
    os.environ["CombineMetrics_ExtraParams"] = config_params.get(
        "CombineMetrics_ExtraParams", "")

    # Verify the dependencies are available on the path
    print("Checking dependencies...")

    dependencies = [
        "cfsan_snp_pipeline", snp_pipeline_aligner, "java", "tabix", "bgzip",
        "bcftools"
    ]
    found_all_dependencies = True
    for executable in dependencies:
        if not utils.which(executable):
            utils.report_error(executable + " is not on the path")
            found_all_dependencies = False

    if not utils.which("samtools"):
        utils.report_error("samtools is not on the path")
        found_all_dependencies = False
    else:
        version_str = utils.extract_version_str("SAMtools",
                                                "samtools 2>&1 > /dev/null")
        samtools_version = version_str.split()[-1]  # just the number
        if samtools_version < "1.4":
            utils.report_error(
                "The installed %s is not supported.  Version 1.4 or higher is required."
                % version_str)
            found_all_dependencies = False

    jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH")
    if jar_file_path:
        stdout = command.run("java -jar " + jar_file_path + " 2>&1")
    if not jar_file_path or "error" in stdout.lower():
        utils.report_error(
            "CLASSPATH is not configured with the path to VarScan.jar")
        found_all_dependencies = False

    picard_required = os.environ[
        "RemoveDuplicateReads"] == "true" or os.environ[
            "EnableLocalRealignment"] == "true"
    if picard_required:
        jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
        if not jar_file_path:
            utils.report_error(
                "CLASSPATH is not configured with the path to picard.jar")
            found_all_dependencies = False
        else:
            stdout = command.run("java -jar " + jar_file_path + " 2>&1")
            if stdout.lower().startswith("error"):
                utils.report_error(stdout)
                found_all_dependencies = False

    gatk_required = os.environ["EnableLocalRealignment"] == "true"
    if gatk_required:
        jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK",
                                                     "CLASSPATH")
        if not jar_file_path:
            utils.report_error(
                "CLASSPATH is not configured with the path to GenomeAnalysisTK.jar"
            )
            found_all_dependencies = False
        else:
            stdout = command.run("java -jar " + jar_file_path +
                                 " --version 2>&1")
            if stdout.lower().startswith("error"):
                utils.report_error(stdout)
                found_all_dependencies = False
            else:
                stdout = command.run("java -jar " + jar_file_path +
                                     " -T IndelRealigner --version 2>&1")
                if "not a valid command" in stdout.lower(
                ) or "indelrealigner is no longer included" in stdout.lower():
                    utils.report_error(
                        "The installed GATK version does not support indel realignment.  Try installing an older release prior to GATK v4."
                    )
                    found_all_dependencies = False
                elif "user error has occurred" in stdout.lower():
                    utils.report_error(stdout)
                    found_all_dependencies = False

    if not found_all_dependencies:
        utils.fatal_error(
            "Check the SNP Pipeline installation instructions here: http://snp-pipeline.readthedocs.org/en/latest/installation.html"
        )
    else:
        print("OK")

    # Process the sample directory command line option
    # TODO: detect broken fastq symlinks
    if args.samplesDir:
        samples_parent_dir = args.samplesDir.rstrip(
            '/')  # strip trailing slash
        if not utils.verify_non_empty_directory("Samples directory",
                                                samples_parent_dir):
            sys.exit(1)

        # verify at least one of the subdirectories contains fastq files.
        dir_sizes = get_sorted_sample_dirs_fastq_sizes(samples_parent_dir)
        dir_sizes = [(size, path) for size, path in dir_sizes if size > 0]
        if len(dir_sizes) == 0:
            utils.fatal_error(
                "Samples directory %s does not contain subdirectories with fastq files."
                % samples_parent_dir)

        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        persist_sorted_sample_dirs_file(samples_parent_dir, sample_dirs_file)

    # Process the file of sample directories command line option
    # TODO: detect broken fastq symlinks
    if args.samplesFile:
        sample_dirs_file = args.samplesFile
        if not os.path.isfile(sample_dirs_file):
            utils.fatal_error(
                "Error: the file of samples directories, %s, does not exist." %
                sample_dirs_file)
        if os.path.getsize(sample_dirs_file) == 0:
            utils.fatal_error(
                "Error: the file of samples directories, %s, is empty." %
                sample_dirs_file)
        rewrite_cleansed_file_of_sample_dirs(
            sample_dirs_file, os.path.join(work_dir, "sampleDirectories.txt"))
        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        validate_file_of_sample_dirs(sample_dirs_file)

    with open(sample_dirs_file) as f:
        sample_dirs_list = f.read().splitlines()
    sample_count = len(sample_dirs_list)

    # --------------------------------------------------------
    if job_queue_mgr is None:
        progress("Step 1 - Prep work")
    else:
        print("Step 1 - Prep work")

    # --------------------------------------------------------
    # Mirror the input reference and samples if requested
    # TODO: make this a pure python solution
    if args.mirror:
        if args.mirror == "soft":
            # soft link, subsequent freshness checks use the timestamp of original file, not the soft link
            mirror_flag = " -s "
        elif args.mirror == "hard":
            # hard link, automatically preserves attributes of the original file
            mirror_flag = " -l "
        else:
            # regular copy, -p explicitly preserves attributes of the original file
            mirror_flag = " -p "

        # flush stdout to keep the unbuffered stderr in chronological order with stdout
        sys.stdout.flush()

        # Mirror/link the reference
        work_reference_dir = os.path.join(work_dir, "reference")
        utils.mkdir_p(work_reference_dir)
        src_reference_file = os.path.abspath(reference_file_path)
        cmd = "cp -v -u -f" + mirror_flag + src_reference_file + ' ' + work_reference_dir
        subprocess.check_call(cmd, shell=True)

        # since we mirrored the reference, we need to update our reference location
        reference_file_path = os.path.join(work_reference_dir,
                                           reference_file_name)

        # Mirror/link the samples
        work_samples_parent_dir = os.path.join(work_dir, "samples")
        for directory in sample_dirs_list:
            basedir = os.path.basename(directory)
            work_sample_dir = os.path.join(work_samples_parent_dir, basedir)
            utils.mkdir_p(work_sample_dir)
            src_sample_dir = os.path.abspath(directory)
            # copy without stderr message and without exit error code because the fastq or fq files might not exist
            cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fastq* " + work_sample_dir + " 2> /dev/null || true"
            subprocess.check_call(cmd, shell=True)
            cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fq* " + work_sample_dir + " 2> /dev/null || true"
            subprocess.check_call(cmd, shell=True)

        # since we mirrored the samples, we need to update our sorted list of samples
        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        persist_sorted_sample_dirs_file(work_samples_parent_dir,
                                        sample_dirs_file)

        # refresh the list of sample dirs -- now in sorted order
        with open(sample_dirs_file) as f:
            sample_dirs_list = f.read().splitlines()

    # get the *.fastq or *.fq files in each sample directory, possibly compressed, on one line per sample, ready to feed to bowtie
    sample_full_path_names_file = os.path.join(work_dir,
                                               "sampleFullPathNames.txt")
    with open(sample_full_path_names_file, 'w') as f:
        for directory in sample_dirs_list:
            file_list = fastq.list_fastq_files(directory)
            print(' '.join(file_list), file=f)

    # Initialize the job runner
    if job_queue_mgr is None:
        runner = JobRunner("local",
                           exception_handler=handle_exception,
                           verbose=args.verbose >= 4)
    elif job_queue_mgr == "grid":
        strip_job_array_suffix = config_params.get(
            "GridEngine_StripJobArraySuffix", "true").lower()
        qsub_extra_params = config_params.get("GridEngine_QsubExtraParams")
        runner = JobRunner(job_queue_mgr,
                           strip_job_array_suffix == "true",
                           qsub_extra_params=qsub_extra_params,
                           verbose=args.verbose >= 4)
    else:
        strip_job_array_suffix = config_params.get(
            "Torque_StripJobArraySuffix", "false").lower()
        qsub_extra_params = config_params.get("Torque_QsubExtraParams")
        runner = JobRunner(job_queue_mgr,
                           strip_job_array_suffix == "true",
                           qsub_extra_params=qsub_extra_params,
                           verbose=args.verbose >= 4)

    progress("Step 2 - Index the reference")
    log_file = os.path.join(log_dir, "indexRef.log")
    command_line = "cfsan_snp_pipeline index_ref" + force_flag + reference_file_path
    job_id_index_ref = runner.run(command_line, "indexRef", log_file)

    progress("Step 3 - Map the sample reads to the reference")
    # Parse the user-specified aligner parameters to find the number of CPU cores requested, for example, "-p 16" or "-n 16"
    # Set the default number of CPU cores if the user did not configure a value.
    if snp_pipeline_aligner == "smalt":
        extra_params_env_var = "SmaltAlign_ExtraParams"
        threads_option = "-n"
    else:
        extra_params_env_var = "Bowtie2Align_ExtraParams"
        threads_option = "-p"

    aligner_max_processes, aligner_threads_per_process = utils.configure_process_threads(
        extra_params_env_var, threads_option, cpu_cores_per_process,
        max_cpu_cores)
    samfilter_max_processes, samfilter_threads_per_process = utils.configure_process_threads(
        "SamtoolsSamFilter_ExtraParams", ["-@", "--threads"],
        cpu_cores_per_process, max_cpu_cores)
    samsort_max_processes, samsort_threads_per_process = utils.configure_process_threads(
        "SamtoolsSort_ExtraParams", ["-@", "--threads"], cpu_cores_per_process,
        max_cpu_cores)
    samindex_max_processes, samindex_threads_per_process = utils.configure_process_threads(
        "SamtoolsIndex_ExtraParams", ["-@"], cpu_cores_per_process,
        max_cpu_cores)
    realigner_max_processes, realigner_threads_per_process = utils.configure_process_threads(
        "RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"],
        cpu_cores_per_process, max_cpu_cores)

    # There are multiple processes within map_reads, each with multiple threads.
    # The CPU allocation must be enough for the process needing the largest number of threads.
    max_processes_list = [
        aligner_max_processes, samfilter_max_processes, samsort_max_processes,
        samindex_max_processes, realigner_max_processes
    ]
    if all([i is None for i in max_processes_list]):
        max_processes = None
    else:
        max_processes = min([i for i in max_processes_list if i is not None])
    threads_per_process = max(aligner_threads_per_process,
                              samfilter_threads_per_process,
                              samsort_threads_per_process,
                              samindex_threads_per_process,
                              realigner_threads_per_process)

    parallel_environment = config_params.get("GridEngine_PEname", None)
    log_file = os.path.join(log_dir, "mapReads.log")
    command_line = "cfsan_snp_pipeline map_reads --threads " + str(
        threads_per_process) + force_flag + reference_file_path + " {1} {2}"
    job_id_map_reads = runner.run_array(
        command_line,
        "mapReads",
        log_file,
        sample_full_path_names_file,
        max_processes=max_processes,
        wait_for=[job_id_index_ref],
        threads=threads_per_process,
        parallel_environment=parallel_environment)

    progress("Step 4 - Find sites with SNPs in each sample")
    if job_queue_mgr in ["grid", "torque"]:
        time.sleep(
            1.0 + float(sample_count) / 150
        )  # workaround torque bug when submitting two large consecutive array jobs, potential bug for grid

    log_file = os.path.join(log_dir, "callSites.log")
    command_line = "cfsan_snp_pipeline call_sites" + force_flag + reference_file_path + " {1}"
    job_id_call_sites = runner.run_array(command_line,
                                         "callSites",
                                         log_file,
                                         sample_dirs_file,
                                         max_processes=max_cpu_cores,
                                         wait_for_array=[job_id_map_reads],
                                         slot_dependency=True)

    progress("Step 5 - Filter abnormal SNP regions")
    log_file = os.path.join(log_dir, "filterRegions.log")
    extra_params = os.environ.get("FilterRegions_ExtraParams", "")
    command_line = "cfsan_snp_pipeline filter_regions" + force_flag + "-n var.flt.vcf " + sample_dirs_file + ' ' + reference_file_path + ' ' + extra_params
    job_id_filter_regions = runner.run(command_line,
                                       "filterRegions",
                                       log_file,
                                       wait_for_array=[job_id_call_sites])

    # Starting from here, there are 2 threads:
    # Thread X.1: the thread processing the original VCF files and corresponding downstream results
    # Thread X.2: the thread processing the preserved VCF files and corresponding downstream results

    progress(
        "Step 6.1 - Merge the SNP sites across all samples into the SNP list file"
    )
    # The mergeSites process creates the filtered list of sample directories.  It is the list of samples not having excessive snps.
    # When running on a workstation, the file exists at this point during the script execution, but on grid or torque, it has not yet been created. However,
    # we know the path to the file regardless of whether it exists yet.
    filtered_sample_dirs_file = sample_dirs_file + ".OrigVCF.filtered"
    # touch $filtered_sample_dirs_file # TODO: why was this touch here in the old run_snp_pipeline.sh script?
    log_file = os.path.join(log_dir, "mergeSites.log")
    output_file = os.path.join(work_dir, "snplist.txt")
    extra_params = os.environ.get("MergeSites_ExtraParams", "")
    command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file
    job_id_merge_sites = runner.run(command_line,
                                    "mergeSites",
                                    log_file,
                                    wait_for=[job_id_filter_regions])

    progress(
        "Step 6.2 - Merge the SNP sites across all samples into the SNP list file"
    )
    # Create another copy of sample directories file, for the thread processing preserved snp files.
    filtered_sample_dirs_file2 = sample_dirs_file + ".PresVCF.filtered"
    # touch $filtered_sample_dirs_file2 # TODO: why was this touch here in the old run_snp_pipeline.sh script?
    log_file = os.path.join(log_dir, "mergeSites_preserved.log")
    output_file = os.path.join(work_dir, "snplist_preserved.txt")
    extra_params = os.environ.get("MergeSites_ExtraParams", "")
    command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file2
    job_id_merge_sites2 = runner.run(command_line,
                                     "mergeSites_preserved",
                                     log_file,
                                     wait_for=[job_id_filter_regions])

    progress("Step 7.1 - Call the consensus SNPs for each sample")
    log_file = os.path.join(log_dir, "callConsensus.log")
    list_file = os.path.join(work_dir, "snplist.txt")
    output_file = "{1}/consensus.fasta"
    extra_params = os.environ.get("CallConsensus_ExtraParams", "")
    command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus.vcf {1}/reads.all.pileup"
    job_id_call_consensus = runner.run_array(command_line,
                                             "callConsensus",
                                             log_file,
                                             sample_dirs_file,
                                             max_processes=max_cpu_cores,
                                             wait_for=[job_id_merge_sites])

    progress("Step 7.2 - Call the consensus SNPs for each sample")
    log_file = os.path.join(log_dir, "callConsensus_preserved.log")
    list_file = os.path.join(work_dir, "snplist_preserved.txt")
    output_file = "{1}/consensus_preserved.fasta"
    extra_params = os.environ.get("CallConsensus_ExtraParams", "")
    command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " -e {1}/var.flt_removed.vcf --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus_preserved.vcf {1}/reads.all.pileup"
    job_id_call_consensus2 = runner.run_array(command_line,
                                              "callConsensus_preserved",
                                              log_file,
                                              sample_dirs_file,
                                              max_processes=max_cpu_cores,
                                              wait_for=[job_id_merge_sites2])

    progress("Step 8.1 - Create the SNP matrix")
    log_file = os.path.join(log_dir, "snpMatrix.log")
    output_file = os.path.join(work_dir, "snpma.fasta")
    extra_params = os.environ.get("SnpMatrix_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file
    job_id_snp_matrix = runner.run(command_line,
                                   "snpMatrix",
                                   log_file,
                                   wait_for_array=[job_id_call_consensus])

    progress("Step 8.2 - Create the SNP matrix")
    log_file = os.path.join(log_dir, "snpMatrix_preserved.log")
    output_file = os.path.join(work_dir, "snpma_preserved.fasta")
    extra_params = os.environ.get("SnpMatrix_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus_preserved.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2
    job_id_snp_matrix2 = runner.run(command_line,
                                    "snpMatrix_preserved",
                                    log_file,
                                    wait_for_array=[job_id_call_consensus2])

    progress("Step 9.1 - Create the reference sequence at SNP sites")
    log_file = os.path.join(log_dir, "snpReference.log")
    list_file = os.path.join(work_dir, "snplist.txt")
    output_file = os.path.join(work_dir, "referenceSNP.fasta")
    extra_params = os.environ.get("SnpReference_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path
    job_id_snp_reference = runner.run(command_line,
                                      "snpReference",
                                      log_file,
                                      wait_for_array=[job_id_call_consensus])

    progress("Step 9.2 - Create the reference sequence at SNP sites")
    log_file = os.path.join(log_dir, "snpReference_preserved.log")
    list_file = os.path.join(work_dir, "snplist_preserved.txt")
    output_file = os.path.join(work_dir, "referenceSNP_preserved.fasta")
    extra_params = os.environ.get("SnpReference_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path
    job_id_snp_reference2 = runner.run(command_line,
                                       "snpReference_preserved",
                                       log_file,
                                       wait_for_array=[job_id_call_consensus2])

    progress("Step 10.1 - Merge sample VCFs to create the multi-VCF file")
    if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""):
        log_file = os.path.join(log_dir, "mergeVcfs.log")
        output_file = os.path.join(work_dir, "snpma.vcf")
        extra_params = os.environ.get("MergeVcfs_ExtraParams", "")
        command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file
        job_id_merge_vcfs = runner.run(command_line,
                                       "mergeVcfs",
                                       log_file,
                                       wait_for_array=[job_id_call_consensus])
    else:
        print("Skipped per CallConsensus_ExtraParams configuration")

    progress("Step 10.2 - Merge sample VCFs to create the multi-VCF file")
    if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""):
        log_file = os.path.join(log_dir, "mergeVcfs_preserved.log")
        output_file = os.path.join(work_dir, "snpma_preserved.vcf")
        extra_params = os.environ.get("MergeVcfs_ExtraParams", "")
        command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-n consensus_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2
        job_id_merge_vcfs2 = runner.run(
            command_line,
            "mergeVcfs_preserved",
            log_file,
            wait_for_array=[job_id_call_consensus2])
    else:
        print("Skipped per CallConsensus_ExtraParams configuration")

    progress("Step 11.1 - Calculate SNP distance matrix")
    log_file = os.path.join(log_dir, "distance.log")
    input_file = os.path.join(work_dir, "snpma.fasta")
    pair_output_file = os.path.join(work_dir, "snp_distance_pairwise.tsv")
    matrix_output_file = os.path.join(work_dir, "snp_distance_matrix.tsv")
    command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file
    job_id_distance = runner.run(command_line,
                                 "distance",
                                 log_file,
                                 wait_for=[job_id_snp_matrix])

    progress("Step 11.2 - Calculate SNP distance matrix")
    log_file = os.path.join(log_dir, "distance_preserved.log")
    input_file = os.path.join(work_dir, "snpma_preserved.fasta")
    pair_output_file = os.path.join(work_dir,
                                    "snp_distance_pairwise_preserved.tsv")
    matrix_output_file = os.path.join(work_dir,
                                      "snp_distance_matrix_preserved.tsv")
    command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file
    job_id_distance2 = runner.run(command_line,
                                  "distance_preserved",
                                  log_file,
                                  wait_for=[job_id_snp_matrix2])

    progress("Step 12 - Collect metrics for each sample")
    log_file = os.path.join(log_dir, "collectMetrics.log")
    output_file = "{1}/metrics"
    extra_params = os.environ.get("CollectMetrics_ExtraParams", "")
    command_line = "cfsan_snp_pipeline collect_metrics" + force_flag + "-o " + output_file + ' ' + extra_params + " {1} " + reference_file_path
    job_id_collect_metrics = runner.run_array(
        command_line,
        "collectMetrics",
        log_file,
        sample_dirs_file,
        max_processes=max_cpu_cores,
        wait_for_array=[job_id_call_consensus, job_id_call_consensus2],
        slot_dependency=True)

    progress(
        "Step 13 - Combine the metrics across all samples into the metrics table"
    )
    log_file = os.path.join(log_dir, "combineMetrics.log")
    output_file = os.path.join(work_dir, "metrics.tsv")
    extra_params = os.environ.get("CombineMetrics_ExtraParams", "")
    command_line = "cfsan_snp_pipeline combine_metrics" + force_flag + "-n metrics -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file
    combine_metrics_job_id = runner.run(
        command_line,
        "combineMetrics",
        log_file,
        wait_for_array=[job_id_collect_metrics])

    # Decide whether to purge the intermediate output files upon successful completion.
    # Case 1: we are running on the HPC.  We always need to submit the purge task. It will decide to do nothing if there were errors.
    if job_queue_mgr is not None:  # HPC
        need_purge = args.purge  # need to submit the purge task, it might decide to do nothing if there were errors
    # Case 2: we are running locally and we know right now whether there were any errors.
    #     Case 2a: We are configured to stop on error, but the fact that we got this far means there were no errors -- so we need to purge.
    #     Case 2b: We are configured to ignore errors, so now we look for evidence of errors and purge if there were no errors.
    else:
        errors_detected = os.path.isfile(error_output_file)
        need_purge = args.purge and not errors_detected

    if need_purge:
        progress("Step 14 - Purge the intermediate output files")
        log_file = os.path.join(log_dir, "purge.log")
        command_line = "cfsan_snp_pipeline purge " + work_dir
        purge_job_id = runner.run(command_line,
                                  "purge",
                                  log_file,
                                  wait_for=[combine_metrics_job_id])

    # Step 15 - Notify user of any non-fatal errors accumulated during processing
    if os.path.isfile(error_output_file) and os.path.getsize(
            error_output_file) > 0 and not stop_on_error:
        print(
            "\nThere were errors processing some samples.\nSee the log file %s for a summary of errors."
            % error_output_file,
            file=sys.stderr)

    # Exit here to prevent showing the "cfsan_snp_pipeline run finished" message.  The jobs are queued, not finished yet.
    if job_queue_mgr is not None:  # HPC
        sys.exit(0)
    else:
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Elapsed time =", elapsed_time)
Пример #16
0
def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file. The sample alignment is sorted, duplicate reads
    are marked, and reads realigned around indels.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and
    smalt; bowtie2 is used when the variable is unset or empty.

    This function expects, or creates (marked with a trailing '*'), the
    following files arranged in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.sorted.deduped.bai*
                sample_name_one/realign.target.intervals*
                sample_name_one/reads.sorted.deduped.indelrealigned.bam*

    The fastq files may be either compressed with gzip or uncompressed.

    The reverse fastq file is optional.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Each step is skipped when its output is already fresh relative to its
    inputs, unless args.forceFlag is set.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
        threads : Number of threads passed to the thread configuration of
            the aligner, samtools, and GATK RealignerTargetCreator
        forceFlag : When true, rebuild every output even if it is fresh

    Environment variables read
    --------------------------
    CLASSPATH : searched for the picard and GenomeAnalysisTK jar files
    SnpPipeline_Aligner : "bowtie2" (default) or "smalt", case-insensitive
    Bowtie2Align_ExtraParams : defaults to "--reorder"
    SmaltAlign_ExtraParams : defaults to "-O"
    SamtoolsSamFilter_ExtraParams : defaults to "-F 4"
    SamtoolsSort_ExtraParams, SamtoolsIndex_ExtraParams
    RemoveDuplicateReads : "true" (default) enables Picard MarkDuplicates
    EnableLocalRealignment : "true" (default) enables the GATK realignment
    PicardJvm_ExtraParams, PicardMarkDuplicates_ExtraParams
    GatkJvm_ExtraParams, RealignerTargetCreator_ExtraParams,
    IndelRealigner_ExtraParams
    TMPDIR or TMP_DIR : temporary directory handed to Picard and the GATK JVM
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path],
                                       error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file",
                                       fastq_files,
                                       error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error(
            "Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[
        0]  # strip the file extension
    reference_id = os.path.basename(reference_base_path)

    num_threads = args.threads

    #==========================================================================
    # verify jar files are in CLASSPATH
    #==========================================================================
    picard_jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
    if not picard_jar_file_path:
        utils.global_error(
            "Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable."
        )
    picard_version_str = utils.extract_version_str(
        "Picard", "java -jar " + picard_jar_file_path +
        " AddOrReplaceReadGroups --version 2>&1")

    gatk_jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK",
                                                      "CLASSPATH")
    if not gatk_jar_file_path:
        utils.global_error(
            "Error: cannot execute GATK. Define the path to GenomeAnalysisTK.jar in the CLASSPATH environment variable."
        )
    gatk_version_str = utils.extract_version_str(
        "GATK", "java -jar " + gatk_jar_file_path + " --version 2>&1")

    #==========================================================================
    # Enforce the proper SAMtools version
    #==========================================================================

    samtools_version_str = utils.extract_version_str(
        "SAMtools", "samtools 2>&1 > /dev/null")
    samtools_version = samtools_version_str.split()[-1]  # just the number

    # Compare numerically rather than as strings: lexically "1.10" < "1.4",
    # which would wrongly reject SAMtools 1.10 and later.
    # When the last token is not a number (e.g. the version could not be
    # recognized), no error is raised -- the same outcome the former string
    # comparison produced for such tokens.
    version_match = re.match(r"(\d+)(?:\.(\d+))?", samtools_version)
    if version_match:
        major_minor = (int(version_match.group(1)),
                       int(version_match.group(2) or 0))
        if major_minor < (1, 4):
            utils.global_error(
                "The installed %s is not supported.  Version 1.4 or higher is required."
                % samtools_version_str)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    # The sam file depends on the fastq files plus the aligner's index file
    source_files = list(fastq_files)
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# %s has already been aligned to %s.  Use the -f option to force a rebuild."
            % (sample_id, reference_id))
    else:
        #==========================================================================
        # Construct the command line to execute bowtie2 or smalt
        #==========================================================================

        # The read group identifies reads from a single run and lane
        read_group_tags = fastq.construct_read_group_tags(
            sample_fastq_file1, sample_id)

        # Make up dummy read group tags if the read group information is missing from the fastq files.
        # GATK components require these tags.
        if read_group_tags is None:
            rg_id = "1"
            rg_sm = sample_id
            rg_lb = "1"
            rg_pl = None
            rg_pu = sample_id
            read_group_tags = fastq.ReadGroupTags(rg_id, rg_sm, rg_lb, rg_pl,
                                                  rg_pu)

        if snp_pipeline_aligner == "bowtie2":
            version_str = utils.extract_version_str("bowtie2",
                                                    "bowtie2 --version")

            # Substitute the default parameters if the user did not specify bowtie parameters
            os.environ["Bowtie2Align_ExtraParams"] = os.environ.get(
                "Bowtie2Align_ExtraParams") or "--reorder"

            # Set the number of threads to use
            utils.configure_process_threads("Bowtie2Align_ExtraParams", "-p",
                                            num_threads, None)
            bowtie2_align_extra_params = os.environ["Bowtie2Align_ExtraParams"]

            # Specify the read group and sample tags here, --rg tags cannot be specified without ID.
            # The read group tags are used by some downstream tools, like Picard and GATK.
            read_group_params = ""
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            if read_group_tags.PL is not None:
                read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

            # Build the command with options depending on whether the fastq files are paired
            command_line = "bowtie2 " + read_group_params + " " + bowtie2_align_extra_params + " -x " + reference_base_path
            if sample_fastq_file2:
                command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
            else:
                command_line += " -U " + sample_fastq_file1

        elif snp_pipeline_aligner == "smalt":
            version_str = utils.extract_version_str("smalt", "smalt version")

            # Substitute the default parameters if the user did not specify smalt parameters
            os.environ["SmaltAlign_ExtraParams"] = os.environ.get(
                "SmaltAlign_ExtraParams") or "-O"

            # Set the number of threads to use
            utils.configure_process_threads("SmaltAlign_ExtraParams", "-n",
                                            num_threads, None)
            smalt_align_extra_params = os.environ["SmaltAlign_ExtraParams"]

            # Don't use the -i 1000 option if the fastq file is unpaired.
            # The stripped string must be the one used to build the command
            # line below; previously the result was stored in an unused
            # variable, so the option was never actually removed.
            if not sample_fastq_file2:
                smalt_align_extra_params = re.sub(
                    "-i[ ]+[0-9]+", '',
                    smalt_align_extra_params)  # regex substitute

            command_line = "smalt map " + smalt_align_extra_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (
                sample_fastq_file2 or "")

        #==========================================================================
        # Run the command to execute bowtie2 or smalt
        #==========================================================================
        verbose_print("# Align sequence %s to reference %s" %
                      (sample_id, reference_id))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sam_file)

        #==========================================================================
        # When using smalt, assign read groups in a separate step.
        # This is already done when using bowtie2.
        #==========================================================================
        if snp_pipeline_aligner == "smalt" and read_group_tags:
            # Keep the raw smalt output under another name; Picard writes the
            # tagged result back to the regular sam file path.
            smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
            shutil.move(sam_file, smalt_sam_file)
            jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
            command_line = "java " + jvm_params + " -jar " + picard_jar_file_path + " AddOrReplaceReadGroups"
            command_line += " I=" + smalt_sam_file
            command_line += " O=" + sam_file
            command_line += " RGID=" + read_group_tags.ID
            command_line += " RGSM=" + read_group_tags.SM
            command_line += " RGLB=" + read_group_tags.LB
            if read_group_tags.PL is None:
                command_line += " RGPL=unknown"  # Picard requires this command line option
            else:
                command_line += " RGPL=" + read_group_tags.PL
            command_line += " RGPU=" + read_group_tags.PU
            verbose_print("")
            verbose_print("# Assign read group id %s" % (read_group_tags.ID))
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
        verbose_print("")

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        # Substitute the default parameters if the user did not specify samtools view parameters
        os.environ["SamtoolsSamFilter_ExtraParams"] = os.environ.get(
            "SamtoolsSamFilter_ExtraParams") or "-F 4"

        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSamFilter_ExtraParams",
                                        ["-@", "--threads"], num_threads, None)
        samtools_samfilter_params = os.environ["SamtoolsSamFilter_ExtraParams"]

        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print(
            "# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file],
                                               sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print(
            "# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild."
            % sample_id)
    else:
        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSort_ExtraParams",
                                        ["-@", "--threads"], num_threads, None)
        # NOTE(review): direct indexing presumes configure_process_threads
        # always leaves this variable set -- confirm against utils.
        samtools_sort_extra_params = os.environ["SamtoolsSort_ExtraParams"]

        command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file
        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads",
                                            "true").lower() == "true"
    input_file = sorted_bam_file
    output_file = utils.add_file_suffix(input_file,
                                        ".deduped",
                                        enable=remove_duplicate_reads)
    if remove_duplicate_reads:
        # Check for fresh deduped bam file; if not, remove duplicate reads
        needs_rebuild = utils.target_needs_rebuild([input_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            picard_jvm_extra_params = os.environ.get(
                "PicardJvm_ExtraParams") or ""
            picard_mark_duplicates_extra_params = os.environ.get(
                "PicardMarkDuplicates_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
            command_line = "java " + picard_jvm_extra_params + " -jar " + picard_jar_file_path + " MarkDuplicates INPUT=" + input_file + " OUTPUT=" + output_file + " METRICS_FILE=" + os.path.join(
                sample_dir, "duplicate_reads_metrics.txt"
            ) + tmp_option + ' ' + picard_mark_duplicates_extra_params
            verbose_print("# Mark duplicate reads in bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file,
                                               "picard MarkDuplicates")
            verbose_print("")

    #==========================================================================
    # Next three steps are part of local realignment around indels
    #==========================================================================
    enable_local_realignment = os.environ.get("EnableLocalRealignment",
                                              "true").lower() == "true"

    #==========================================================================
    # Index the sorted bam file prior to RealignerTargetCreator
    #==========================================================================

    input_file = output_file  # output from last step becomes input to this step
    if enable_local_realignment:
        # Check for fresh bai file; if not, index it
        bam_index_file = input_file[:-3] + "bai"  # swap the "bam" extension for "bai"
        needs_rebuild = utils.target_needs_rebuild([input_file],
                                                   bam_index_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Bam file index is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            # Set the number of threads to use
            utils.configure_process_threads("SamtoolsIndex_ExtraParams", "-@",
                                            num_threads, None)
            samtools_index_extra_params = os.environ[
                "SamtoolsIndex_ExtraParams"]

            command_line = "samtools index " + samtools_index_extra_params + ' ' + input_file + ' ' + bam_index_file
            verbose_print("# Index bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % samtools_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(bam_index_file,
                                               "samtools index")
            verbose_print("")

    #==========================================================================
    # Identify targets for realignment
    #==========================================================================

    if enable_local_realignment:
        # Check for fresh realign_targets_file file; if not run RealignerTargetCreator
        realign_targets_file = os.path.join(sample_dir,
                                            "realign.target.intervals")
        needs_rebuild = utils.target_needs_rebuild(
            [input_file, bam_index_file], realign_targets_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Realign targets file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            # Only add the JVM temp dir when the user has not set one already
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            # Set the number of threads to use
            utils.configure_process_threads(
                "RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"],
                num_threads, None)
            realigner_target_creator_extra_params = os.environ[
                "RealignerTargetCreator_ExtraParams"]

            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T RealignerTargetCreator -R " + reference_file_path + " -I " + input_file + " -o " + realign_targets_file + ' ' + realigner_target_creator_extra_params
            verbose_print("# Identify targets for realignment.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(realign_targets_file,
                                               "GATK RealignerTargetCreator",
                                               empty_ok=True)
            verbose_print("")

    #==========================================================================
    # Realign around indels
    #==========================================================================

    output_file = utils.add_file_suffix(input_file,
                                        ".indelrealigned",
                                        enable=enable_local_realignment)
    if enable_local_realignment:
        # Check for fresh indelrealigned bam file; if not run IndelRealigner
        needs_rebuild = utils.target_needs_rebuild(
            [input_file, bam_index_file, realign_targets_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print(
                "# Indelrealigned bam file is already freshly created for %s.  Use the -f option to force a rebuild."
                % sample_id)
        else:
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            # Only add the JVM temp dir when the user has not set one already
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            indel_realigner_extra_params = os.environ.get(
                "IndelRealigner_ExtraParams") or ""
            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T IndelRealigner -R " + reference_file_path + " -targetIntervals " + realign_targets_file + " -I " + input_file + " -o " + output_file + ' ' + indel_realigner_extra_params
            verbose_print("# Realign around indels")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file,
                                               "GATK IndelRealigner")
Пример #17
0
def _leading_version_numbers(version):
    """Return the leading dotted numeric components of a version as a tuple.

    Parsing stops at the first dot-separated component that does not begin
    with a digit, and any non-digit suffix on a component is ignored.
    Examples: "1.10" -> (1, 10), "0.1.19-44428cd" -> (0, 1, 19),
    "version" -> ().
    """
    numbers = []
    for component in version.split("."):
        match = re.match(r"[0-9]+", component)
        if match is None:
            break
        numbers.append(int(match.group()))
    return tuple(numbers)


def map_reads(args):
    """Align reads to the reference.

    Execute an external program (bowtie2 or smalt) to map the fastq reads
    to a reference file. The sample alignment is sorted, duplicate reads
    are marked, and reads realigned around indels.

    The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt.

    Each step is skipped when utils.target_needs_rebuild() reports that its
    output does not need rebuilding, unless args.forceFlag is set.

    The duplicate marking step runs only when the environment variable
    RemoveDuplicateReads is "true" (the default when unset), and the indexing
    and realignment steps run only when EnableLocalRealignment is "true"
    (the default when unset).

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/sampleFastqFile_1.fastq
                sample_name_one/sampleFastqFile_2.fastq
                sample_name_one/reads.sam*
                sample_name_one/reads.unsorted.bam*
                sample_name_one/reads.sorted.bam*
                sample_name_one/reads.sorted.deduped.bam*
                sample_name_one/reads.sorted.deduped.bai*
                sample_name_one/realign.target.intervals*
                sample_name_one/reads.sorted.deduped.indelrealigned.bam*

    The fastq files may be either compressed with gzip or uncompressed.

    The reverse fastq file is optional.

    All the input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleFastqFile1 : File path of the forward fastq file
        sampleFastqFile2 : Optional file path of the reverse fastq file
        threads : Thread count passed to utils.configure_process_threads()
            for the aligner, samtools and RealignerTargetCreator steps
        forceFlag : When true, run every step even if its output is fresh
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    # Verify fastq files exist and are not empty
    sample_fastq_file1 = args.sampleFastqFile1
    sample_fastq_file2 = args.sampleFastqFile2
    fastq_files = [sample_fastq_file1]
    if sample_fastq_file2:
        fastq_files.append(sample_fastq_file2)

    utils.verify_non_empty_input_files("Sample file", fastq_files, error_handler="sample")

    # The environment variable SnpPipeline_Aligner selects between bowtie2 and smalt
    snp_pipeline_aligner = os.environ.get("SnpPipeline_Aligner") or "bowtie2"
    snp_pipeline_aligner = snp_pipeline_aligner.lower()
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.global_error("Error: only bowtie2 and smalt aligners are supported.")

    sample_dir = os.path.dirname(sample_fastq_file1)
    sample_id = utils.sample_id_from_file(sample_fastq_file1)
    reference_base_path = os.path.splitext(reference_file_path)[0] # strip the file extension
    reference_id = os.path.basename(reference_base_path)

    num_threads = args.threads

    #==========================================================================
    # verify jar files are in CLASSPATH
    #==========================================================================
    picard_jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH")
    if not picard_jar_file_path:
        utils.global_error("Error: cannot execute Picard. Define the path to picard.jar in the CLASSPATH environment variable.")
    picard_version_str = utils.extract_version_str("Picard", "java -jar " + picard_jar_file_path + " AddOrReplaceReadGroups --version 2>&1")

    gatk_jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK", "CLASSPATH")
    if not gatk_jar_file_path:
        utils.global_error("Error: cannot execute GATK. Define the path to GenomeAnalysisTK.jar in the CLASSPATH environment variable.")
    gatk_version_str = utils.extract_version_str("GATK", "java -jar " + gatk_jar_file_path + " --version 2>&1")

    #==========================================================================
    # Enforce the proper SAMtools version
    #==========================================================================

    samtools_version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null")
    samtools_version = samtools_version_str.split()[-1] # just the number
    # Compare numerically, not as strings: as text, "1.10" sorts before "1.4"
    # and a supported release would be rejected. A version that cannot be
    # parsed yields an empty tuple and is not rejected here.
    samtools_version_numbers = _leading_version_numbers(samtools_version)
    if samtools_version_numbers and samtools_version_numbers < (1, 4):
        utils.global_error("The installed %s is not supported.  Version 1.4 or higher is required." % samtools_version_str)

    #==========================================================================
    # Check if alignment to reference has already been done
    #==========================================================================
    sam_file = os.path.join(sample_dir, "reads.sam")
    source_files = [sample_fastq_file1]
    if sample_fastq_file2:
        source_files.append(sample_fastq_file2)
    # The aligner's reference index file is also an input to the alignment
    if snp_pipeline_aligner == "bowtie2":
        source_files.append(reference_base_path + ".rev.1.bt2")
    elif snp_pipeline_aligner == "smalt":
        source_files.append(reference_base_path + ".smi")
    needs_rebuild = utils.target_needs_rebuild(source_files, sam_file)

    if not args.forceFlag and not needs_rebuild:
        verbose_print("# %s has already been aligned to %s.  Use the -f option to force a rebuild." % (sample_id, reference_id))
    else:
        #==========================================================================
        # Construct the command line to execute bowtie2 or smalt
        #==========================================================================

        # The read group identifies reads from a single run and lane
        read_group_tags = fastq.construct_read_group_tags(sample_fastq_file1, sample_id)

        # Make up dummy read group tags if the read group information is missing from the fastq files.
        # GATK components require these tags.
        if read_group_tags is None:
            rg_id = "1"
            rg_sm = sample_id
            rg_lb = "1"
            rg_pl = None  # platform unknown; handled separately for each aligner below
            rg_pu = sample_id
            read_group_tags = fastq.ReadGroupTags(rg_id, rg_sm, rg_lb, rg_pl, rg_pu)

        if snp_pipeline_aligner == "bowtie2":
            version_str = utils.extract_version_str("bowtie2", "bowtie2 --version")

            # Substitute the default parameters if the user did not specify bowtie parameters
            os.environ["Bowtie2Align_ExtraParams"] = os.environ.get("Bowtie2Align_ExtraParams") or "--reorder"

            # Set the number of threads to use
            utils.configure_process_threads("Bowtie2Align_ExtraParams", "-p", num_threads, None)
            bowtie2_align_extra_params = os.environ["Bowtie2Align_ExtraParams"]

            # Specify the read group and sample tags here, --rg tags cannot be specified without ID.
            # The read group tags are used by some downstream tools, like Picard and GATK.
            read_group_params = ""
            read_group_params += " --rg-id " + read_group_tags.ID
            read_group_params += " --rg SM:" + read_group_tags.SM
            read_group_params += " --rg LB:" + read_group_tags.LB
            if read_group_tags.PL is not None:
                read_group_params += " --rg PL:" + read_group_tags.PL
            read_group_params += " --rg PU:" + read_group_tags.PU

            # Build the command with options depending on whether the fastq files are paired
            command_line = "bowtie2 " + read_group_params + " " + bowtie2_align_extra_params + " -x " + reference_base_path
            if sample_fastq_file2:
                command_line += " -1 " + sample_fastq_file1 + " -2 " + sample_fastq_file2
            else:
                command_line += " -U " + sample_fastq_file1

        elif snp_pipeline_aligner == "smalt":
            version_str = utils.extract_version_str("smalt", "smalt version")

            # Substitute the default parameters if the user did not specify smalt parameters
            os.environ["SmaltAlign_ExtraParams"] = os.environ.get("SmaltAlign_ExtraParams") or  "-O"

            # Set the number of threads to use
            utils.configure_process_threads("SmaltAlign_ExtraParams", "-n", num_threads, None)
            smalt_align_extra_params = os.environ["SmaltAlign_ExtraParams"]

            # Don't use the -i 1000 option if the fastq file is unpaired.
            # The stripped text must be stored in the variable used to build
            # the command line, otherwise the option is still passed to smalt.
            if not sample_fastq_file2:
                smalt_align_extra_params = re.sub("-i[ ]+[0-9]+", '', smalt_align_extra_params) # regex substitute

            command_line = "smalt map " + smalt_align_extra_params + " " + reference_base_path + " " + sample_fastq_file1 + " " + (sample_fastq_file2 or "")

        #==========================================================================
        # Run the command to execute bowtie2 or smalt
        #==========================================================================
        verbose_print("# Align sequence %s to reference %s" % (sample_id, reference_id))
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % version_str)
        command.run(command_line, sam_file)

        #==========================================================================
        # When using smalt, assign read groups in a separate step.
        # This is already done when using bowtie2.
        #==========================================================================
        if snp_pipeline_aligner == "smalt" and read_group_tags:
            # Keep the raw smalt output under a different name so Picard can
            # write the tagged result to the expected reads.sam path.
            smalt_sam_file = os.path.join(sample_dir, "reads.smalt.sam")
            shutil.move(sam_file, smalt_sam_file)
            jvm_params = os.environ.get("PicardJvm_ExtraParams") or ""
            command_line = "java " + jvm_params + " -jar " + picard_jar_file_path + " AddOrReplaceReadGroups"
            command_line += " I=" + smalt_sam_file
            command_line += " O=" + sam_file
            command_line += " RGID=" + read_group_tags.ID
            command_line += " RGSM=" + read_group_tags.SM
            command_line += " RGLB=" + read_group_tags.LB
            if read_group_tags.PL is None:
                command_line += " RGPL=unknown"  # Picard requires this command line option
            else:
                command_line += " RGPL=" + read_group_tags.PL
            command_line += " RGPU=" + read_group_tags.PU
            verbose_print("")
            verbose_print("# Assign read group id %s" % (read_group_tags.ID))
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
        verbose_print("")

    #==========================================================================
    # Convert sam to bam file, selecting only the mapped reads
    #==========================================================================

    # Check for fresh bam file; if not, convert to bam file with only mapped reads
    unsorted_bam_file = os.path.join(sample_dir, "reads.unsorted.bam")
    needs_rebuild = utils.target_needs_rebuild([sam_file], unsorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Unsorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        # Substitute the default parameters if the user did not specify samtools view parameters
        os.environ["SamtoolsSamFilter_ExtraParams"] = os.environ.get("SamtoolsSamFilter_ExtraParams") or "-F 4"

        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSamFilter_ExtraParams", ["-@", "--threads"], num_threads, None)
        samtools_samfilter_params = os.environ["SamtoolsSamFilter_ExtraParams"]

        command_line = "samtools view -S -b " + samtools_samfilter_params + " -o " + unsorted_bam_file + ' ' + sam_file
        verbose_print("# Convert sam file to bam file with only mapped positions.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(unsorted_bam_file, "samtools view")
        verbose_print("")

    #==========================================================================
    # Sort the BAM file
    #==========================================================================

    # Check for fresh sorted bam file; if not, sort it
    sorted_bam_file = os.path.join(sample_dir, "reads.sorted.bam")
    needs_rebuild = utils.target_needs_rebuild([unsorted_bam_file], sorted_bam_file)
    if not args.forceFlag and not needs_rebuild:
        verbose_print("# Sorted bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
    else:
        # Set the number of threads to use
        utils.configure_process_threads("SamtoolsSort_ExtraParams", ["-@", "--threads"], num_threads, None)
        samtools_sort_extra_params = os.environ["SamtoolsSort_ExtraParams"]

        command_line = "samtools sort " + samtools_sort_extra_params + " -o " + sorted_bam_file + ' ' + unsorted_bam_file
        verbose_print("# Convert bam to sorted bam file.")
        verbose_print("# %s %s" % (utils.timestamp(), command_line))
        verbose_print("# %s" % samtools_version_str)
        command.run(command_line, sys.stdout)
        utils.sample_error_on_missing_file(sorted_bam_file, "samtools sort")
        verbose_print("")

    #==========================================================================
    # Mark duplicate reads, so they will be ignored in subsequent steps
    #==========================================================================

    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads", "true").lower() == "true"
    input_file = sorted_bam_file
    output_file = utils.add_file_suffix(input_file, ".deduped", enable=remove_duplicate_reads)
    if remove_duplicate_reads:
        # Check for fresh deduped bam file; if not, remove duplicate reads
        needs_rebuild = utils.target_needs_rebuild([input_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Deduped bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            picard_jvm_extra_params = os.environ.get("PicardJvm_ExtraParams") or ""
            picard_mark_duplicates_extra_params = os.environ.get("PicardMarkDuplicates_ExtraParams") or ""
            # Pass a temp directory to Picard only when one is configured
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            tmp_option = " TMP_DIR=" + tmpdir if tmpdir else ""
            command_line = "java " + picard_jvm_extra_params + " -jar " + picard_jar_file_path + " MarkDuplicates INPUT=" + input_file + " OUTPUT=" + output_file + " METRICS_FILE=" + os.path.join(sample_dir, "duplicate_reads_metrics.txt") + tmp_option + ' ' + picard_mark_duplicates_extra_params
            verbose_print("# Mark duplicate reads in bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % picard_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file, "picard MarkDuplicates")
            verbose_print("")

    #==========================================================================
    # Next three steps are part of local realignment around indels
    #==========================================================================
    enable_local_realignment = os.environ.get("EnableLocalRealignment", "true").lower() == "true"

    #==========================================================================
    # Index the sorted bam file prior to RealignerTargetCreator
    #==========================================================================

    input_file = output_file # output from last step becomes input to this step
    if enable_local_realignment:
        # Check for fresh bai file; if not, index it
        bam_index_file = input_file[:-3] + "bai"  # replace the last three characters (the "bam" extension) with "bai"
        needs_rebuild = utils.target_needs_rebuild([input_file], bam_index_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Bam file index is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            # Set the number of threads to use
            utils.configure_process_threads("SamtoolsIndex_ExtraParams", "-@", num_threads, None)
            samtools_index_extra_params = os.environ["SamtoolsIndex_ExtraParams"]

            command_line = "samtools index " + samtools_index_extra_params + ' ' + input_file + ' ' + bam_index_file
            verbose_print("# Index bam file.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % samtools_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(bam_index_file, "samtools index")
            verbose_print("")


    #==========================================================================
    # Identify targets for realignment
    #==========================================================================

    if enable_local_realignment:
        # Check for fresh realign_targets_file file; if not run RealignerTargetCreator
        realign_targets_file = os.path.join(sample_dir, "realign.target.intervals")
        needs_rebuild = utils.target_needs_rebuild([input_file, bam_index_file], realign_targets_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Realign targets file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            # Add the JVM temp directory option unless the user already supplied one
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            # Set the number of threads to use
            utils.configure_process_threads("RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"], num_threads, None)
            realigner_target_creator_extra_params = os.environ["RealignerTargetCreator_ExtraParams"]

            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T RealignerTargetCreator -R " + reference_file_path + " -I " + input_file + " -o " + realign_targets_file  + ' ' + realigner_target_creator_extra_params
            verbose_print("# Identify targets for realignment.")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(realign_targets_file, "GATK RealignerTargetCreator", empty_ok=True)
            verbose_print("")

    #==========================================================================
    # Realign around indels
    #==========================================================================

    output_file = utils.add_file_suffix(input_file, ".indelrealigned", enable=enable_local_realignment)
    if enable_local_realignment:
        # Check for fresh indelrealigned bam file; if not run IndelRealigner
        needs_rebuild = utils.target_needs_rebuild([input_file, bam_index_file, realign_targets_file], output_file)
        if not args.forceFlag and not needs_rebuild:
            verbose_print("# Indelrealigned bam file is already freshly created for %s.  Use the -f option to force a rebuild." % sample_id)
        else:
            gatk_jvm_extra_params = os.environ.get("GatkJvm_ExtraParams") or ""
            # Add the JVM temp directory option unless the user already supplied one
            tmpdir = os.environ.get("TMPDIR") or os.environ.get("TMP_DIR")
            if tmpdir and "-Djava.io.tmpdir" not in gatk_jvm_extra_params:
                gatk_jvm_extra_params += " -Djava.io.tmpdir=" + tmpdir

            indel_realigner_extra_params = os.environ.get("IndelRealigner_ExtraParams") or ""
            command_line = "java " + gatk_jvm_extra_params + " -jar " + gatk_jar_file_path + " -T IndelRealigner -R " + reference_file_path + " -targetIntervals " + realign_targets_file + " -I " + input_file + " -o " + output_file  + ' ' + indel_realigner_extra_params
            verbose_print("# Realign around indels")
            verbose_print("# %s %s" % (utils.timestamp(), command_line))
            verbose_print("# %s" % gatk_version_str)
            command.run(command_line, sys.stdout)
            utils.sample_error_on_missing_file(output_file, "GATK IndelRealigner")