Example #1
def getMask(globs, cmds, vcf_file):
    # Get the sites to be masked into a bed file.

    mask_bedfile = os.path.join(globs['iterfadir'],
                                "iter-" + globs['iter-str'] + "-masksites.bed")
    if globs['diploid']:
        mask_bedfile = mask_bedfile.replace("-masksites.bed",
                                            "-diploid-masksites.bed")

    cmd = "zgrep \"\./\.\" " + vcf_file + " | awk '{{OFS=\"\t\"; if ($0 !~ /\#/); print $1, $2-1, $2}}' | bedtools merge -i - > " + mask_bedfile
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get mask sites",
        'outfile': mask_bedfile,
        'logfile': "",
        'start': False
    }

    run = True
    if globs['resume']:
        if os.path.isfile(mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
            PC.report_step(globs, cmds, cmd, "RESUME",
                           "previous output found: " + mask_bedfile)
            run = False

    if run:
        if not globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)

            if os.path.isfile(
                    mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
                num_sites = str(len(open(mask_bedfile, "r").readlines()))
                PC.report_step(globs, cmds, cmd, "SUCCESS",
                               num_sites + " mask sites read: " + mask_bedfile)
            else:
                PC.report_step(
                    globs, cmds, cmd, "ERROR!",
                    "Mask sites file not found or empty: " + mask_bedfile)
                globs['exit-code'] = 1
                PC.endProg(globs)

        else:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)

    return mask_bedfile, cmds
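
A rough standalone sketch of what the zgrep/awk part of the command above does: scan a bgzipped VCF for uncalled genotypes ("./.") and write 0-based, half-open BED intervals. The interval merging performed by bedtools merge is omitted, and the file names in the commented call are placeholders.

import gzip

def uncalled_sites_to_bed(vcf_path, bed_path):
    # Write one BED interval per VCF record that contains an uncalled genotype.
    with gzip.open(vcf_path, "rt") as vcf, open(bed_path, "w") as bed:
        for line in vcf:
            if line.startswith("#"):
                continue
            if "./." in line:
                fields = line.rstrip("\n").split("\t")
                bed.write(fields[0] + "\t" + str(int(fields[1]) - 1) + "\t" + fields[1] + "\n")

# uncalled_sites_to_bed("iter-01.vcf.gz", "iter-01-masksites.bed")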
Example #2
def selectSNPs(globs, cmds, vcf_file):
    # Run the command to select only SNPs from a VCF file.

    gatk_cmd = globs[
        'gatk-path'] + " SelectVariants -V " + vcf_file + " -O " + globs[
            'iter-final-vcf'] + " -select-type SNP -xl-select-type INDEL -xl-select-type MIXED -xl-select-type SYMBOLIC"
    cmd_num = PC.getCMDNum(globs, len(cmds))
    cmds[gatk_cmd] = {
        'cmd-num': cmd_num,
        'desc': "Select SNPs",
        'outfile': globs['iter-final-vcf'],
        'logfile': globs['iter-final-vcf-log'],
        'start': False
    }
    exit_flag = PC.runCMD(gatk_cmd, globs, cmds, True)
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    return cmds
Example #3
def indexVCF(globs, cmds, vcf_file):

    index_file = vcf_file + ".tbi"
    cur_logfile = os.path.join(globs['iterlogdir'],
                               "tabix-" + globs['iter-str'] + ".log")

    index_cmd = "tabix -fp vcf " + vcf_file
    cmds[index_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Index VCF",
        'outfile': index_file,
        'logfile': cur_logfile,
        'start': False
    }
    exit_flag = PC.runCMD(index_cmd, globs, cmds, True)
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    return cmds
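
If run outside the pipeline, the same tabix command could be issued directly with the standard library, sending tool output to the iteration log file; a minimal sketch with placeholder paths:

import subprocess

def run_tabix(vcf_path, log_path):
    # Index a bgzipped VCF with tabix, appending tool output to the log file.
    with open(log_path, "a") as log:
        proc = subprocess.run(["tabix", "-fp", "vcf", vcf_path],
                              stdout=log, stderr=subprocess.STDOUT)
    return proc.returncode == 0

# ok = run_tabix("iter-01-filter-final.vcf.gz", "tabix-01.log")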
Example #4
def gatherVCFs(globs, cmds):
    # Combine the region VCFs from haplotypeCallerMulti.

    # vcf_file = os.path.join(globs['itervcfdir'], "iter-" + globs['iter-str'] + "-filter.vcf.gz");
    # cur_logfile = os.path.join(globs['iterlogdir'], "gatk-gathervcfs-iter-" + globs['iter-str'] + ".log");
    # if globs['last-iter'] and globs['indels']:
    #     vcf_file = vcf_file.replace("-filter.vcf.gz", "-filter-final.vcf.gz")
    #     cur_logfile = cur_logfile.replace(".log", "-final.log");
    params_file = os.path.join(
        globs['itervcfdir'],
        "iter-" + globs['iter-str'] + "-gathervcfs-params.txt")

    # infile_ext = "-snps-filter.vcf.gz";
    # if globs['last-iter']:
    #     if globs['indels']:
    #         infile_ext = "-filter.vcf.gz";
    infile_ext = "-filter.vcf.gz"

    with open(params_file, "w") as paramsfile:
        for scaff in globs['scaffolds']:
            scaff_vcf = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + infile_ext)
            paramsfile.write("-I " + scaff_vcf + "\n")
    gatk_cmd = globs[
        'gatk-path'] + " GatherVcfs --arguments_file " + params_file + " -O " + globs[
            'iter-gather-vcf']
    cmds[gatk_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Gather VCFs",
        'outfile': globs['iter-gather-vcf'],
        'logfile': globs['iter-gather-vcf-log'],
        'start': False
    }

    exit_flag = PC.runCMD(gatk_cmd, globs, cmds, True)
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    return cmds
Example #5
def maskFa(globs, cmds, mask_bedfile, cur_ref):
    # Mask the sites listed in the BED file in the reference FASTA with bedtools maskfasta.
    prev_iter = str(int(globs['iter-str']) - 1)
    if len(prev_iter) == 1:
        prev_iter = "0" + prev_iter

    if globs['indels']:
        cur_logfile = os.path.join(
            globs['iterlogdir'],
            "bedtools-maskfasta-" + globs['iter-str'] + ".log")
        mask_ref = os.path.join(globs['iterfadir'],
                                "iter-" + prev_iter + "-masked.fa")
    else:
        cur_logfile = os.path.join(
            globs['iterlogdir'],
            "bedtools-maskfasta-" + globs['iter-str'] + "-snps.log")
        mask_ref = os.path.join(globs['iterfadir'],
                                "iter-" + prev_iter + "-snps-masked.fa")

    mask_cmd = globs[
        'bedtools-path'] + " maskfasta -fi " + cur_ref + " -bed " + mask_bedfile
    if globs['softmask']:
        mask_cmd += " -soft"
    mask_cmd += " -fo " + mask_ref
    cmds[mask_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Softmask reference",
        'outfile': mask_ref,
        'logfile': cur_logfile,
        'start': False
    }

    exit_flag = PC.runCMD(mask_cmd, globs, cmds, True)
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    return mask_ref, cmds
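
As a side note, the two-line zero-padding of prev_iter above is equivalent to str.zfill; a quick illustration with a placeholder iteration string:

prev_iter = str(int("03") - 1).zfill(2)   # "02", same result as the padding logic above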
Example #6
def getScaffs(cur_fa, globs, cmds, report_status=True):
    # Save the list of scaffolds/contigs/chromosomes from a FASTA file to a text file.

    cmd = "grep \">\" " + cur_fa + " | sed 's/>//g'"  # > " + globs['scaffs'];
    # grep the number of scaffolds in the reference... I guess this could also be done by just reading
    # the number of lines in the index file...

    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get ref scaffold IDs",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    # Add the grep command to the global commands dict.

    if not globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        cmd_result = subprocess.run(cmd,
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
        cur_scaffs = list(filter(None,
                                 cmd_result.stdout.decode().split("\n")))
        globs['scaffolds'] = [
            scaff[:scaff.index(" ")] if " " in scaff else scaff
            for scaff in cur_scaffs
        ]
        PC.report_step(globs, cmds, cmd, "SUCCESS",
                       str(len(globs['scaffolds'])) + " scaffold IDs read")
    else:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        globs['scaffolds'] = []
    # Run the grep command and check for errors.

    return cmds
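
A standalone alternative to the grep/sed pipeline above is to read the scaffold IDs directly from the FASTA headers in Python (everything after ">" up to the first space); shown only as an illustration, with gzip support assumed for compressed references.

import gzip

def read_scaffold_ids(fasta_path):
    # Collect the ID portion of every ">" header line in a FASTA file.
    opener = gzip.open if fasta_path.endswith(".gz") else open
    scaffs = []
    with opener(fasta_path, "rt") as fa:
        for line in fa:
            if line.startswith(">"):
                scaffs.append(line[1:].split()[0])
    return scaffs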
Example #7
def indexFa(globs, cmds, cur_ref):
    # Creates all reference fasta index files for subsequent iterations. For the first
    # iteration these are assumed to be created before the program is run.

    indices = ['dict', 'faidx', 'index']
    # The types of indices needed: .dict from picard, .fai from samtools, and the current --mapper index files.

    index_cmds = {}
    ref_ext = PC.detectRefExt(cur_ref, globs)
    # Detect whether the reference is compressed or not.

    for step in indices:
        if step == 'dict':
            cur_logfile = os.path.join(
                globs['iterlogdir'],
                "picard-dict-iter-" + globs['iter-str'] + ".log")
            dict_file = cur_ref.replace(ref_ext, ".dict")

            if os.path.isfile(dict_file) and globs['overwrite']:
                os.system("rm " + dict_file)

            picard_cmd = globs[
                'picard-path'] + " CreateSequenceDictionary R=" + cur_ref + " O=" + dict_file
            cmds[picard_cmd] = {
                'cmd-num': PC.getCMDNum(globs, len(cmds)),
                'desc': "Create reference dict",
                'outfile': dict_file,
                'logfile': cur_logfile,
                'start': False
            }
            index_cmds[picard_cmd] = {
                'cmd-num': PC.getCMDNum(globs, len(cmds)),
                'desc': "Create reference dict",
                'outfile': dict_file,
                'logfile': cur_logfile,
                'start': False
            }
        # Create the reference dictionary by running picard CreateSequenceDictionary

        if step == "faidx":
            cur_logfile = os.path.join(
                globs['iterlogdir'],
                "samtools-faidx-iter-" + globs['iter-str'] + ".log")
            faidx_file = cur_ref + ".fai"

            faidx_cmd = globs['samtools-path'] + " faidx " + cur_ref
            cmds[faidx_cmd] = {
                'cmd-num': PC.getCMDNum(globs, len(cmds)),
                'desc': "Create reference faidx",
                'outfile': faidx_file,
                'logfile': cur_logfile,
                'start': False
            }
            index_cmds[faidx_cmd] = {
                'cmd-num': PC.getCMDNum(globs, len(cmds)),
                'desc': "Create reference faidx",
                'outfile': faidx_file,
                'logfile': cur_logfile,
                'start': False
            }
        # Create the reference index by running samtools faidx

        if step == "index":
            if globs['mapper'] == "bwa":
                cur_logfile = os.path.join(
                    globs['iterlogdir'],
                    "bwa-index-iter-" + globs['iter-str'] + ".log")
                index_files = [
                    cur_ref + ".amb", cur_ref + ".ann", cur_ref + ".bwt",
                    cur_ref + ".pac", cur_ref + ".sa"
                ]

                index_cmd = globs['mapper-path'] + " index " + cur_ref
                cmds[index_cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Create BWA reference index",
                    'outfile': "",
                    'logfile': cur_logfile,
                    'start': False
                }
                index_cmds[index_cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Create BWA reference index",
                    'outfile': "",
                    'logfile': cur_logfile,
                    'start': False
                }
            # Create the reference index by running bwa index if --mapper is bwa

            elif globs['mapper'] == "hisat2":
                cur_logfile = os.path.join(
                    globs['iterlogdir'],
                    "hisat2-build-index-iter-" + globs['iter-str'] + ".log")
                index_file = cur_ref + ".ht"

                index_cmd = globs[
                    'mapper-path'] + "-build " + cur_ref + " " + cur_ref
                cmds[index_cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Create hisat2 reference index",
                    'outfile': "",
                    'logfile': cur_logfile,
                    'start': False
                }
                index_cmds[index_cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Create hisat2 reference index",
                    'outfile': "",
                    'logfile': cur_logfile,
                    'start': False
                }
            # Create the reference index by running hisat2-build if --mapper is hisat2

    index_procs = min(3, globs['num-procs'])
    pool = mp.Pool(processes=index_procs)
    for result in pool.starmap(PC.runCMD, ((index_cmd, globs, cmds, True)
                                           for index_cmd in index_cmds)):
        if result:
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()
    # Run the index commands in parallel and check for errors.

    return cmds


#############################################################################
Example #8
def BWA(globs, cmds, cur_ref):
    # Map a set of reads with BWA mem.

    bwa_cmds, bamfiles = {}, []
    for lib_type in globs['libs']:
        # Generate a BWA command for each input fastq type.

        cur_logfile = os.path.join(
            globs['iterlogdir'],
            "bwa-mem-" + lib_type + "-iter-" + globs['iter-str'] + ".log")
        bamfile = os.path.join(
            globs['iterbamdir'],
            lib_type + "-iter-" + globs['iter-str'] + ".bam.gz")
        bamfiles.append(bamfile)
        # Get the bam file and log file for the current fastq file.

        rg_fields = ["ID", "PL", "PU", "LB", "SM"]
        rg_str = ["@RG"] + [
            field + ":" + globs['rg'][field] for field in rg_fields
        ]
        rg_str = "\\t".join(rg_str)
        # Gets the read group info from globs and parses it for BWA's -R option

        bwa_cmd = globs['mapper-path'] + " mem -t " + str(
            globs['map-t']
        ) + " -M -R '" + rg_str + "' " + cur_ref + " " + globs['libs'][lib_type]
        bwa_cmd += " | " + globs['samtools-path'] + " sort"
        bwa_cmd += " | " + globs['samtools-path'] + " view -bh -"
        bwa_cmd += " > " + bamfile
        # Generate the bwa mem command for the current fastq file, including passing output to samtools for sorting and
        # converting to .bam.

        cmd_num = PC.getCMDNum(globs, len(cmds))
        # Get the current command number for the log.

        cmds[bwa_cmd] = {
            'cmd-num': cmd_num,
            'desc': "BWA " + lib_type + " read mapping",
            'outfile': bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the bwa mem command to the global cmds dict.

        bwa_cmds[bwa_cmd] = {
            'cmd-num': cmd_num,
            'desc': "BWA " + lib_type + " read mapping",
            'logfile': cur_logfile,
            'start': False
        }
        # Save the bwa mem command to the bwa_cmds dict.
    # Prepare the BWA commands for each library

    pool = mp.Pool(processes=globs['map-procs'])
    for result in pool.starmap(PC.runCMD, ((bwa_cmd, globs, cmds, True)
                                           for bwa_cmd in bwa_cmds)):
        if result:
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()
    # Run the BWA commands across multiple processors, if specified
    # End the program if an error is encountered

    return bamfiles, cmds
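
For reference, the read-group string assembled above expands to a single @RG line with literal "\t" separators, which is the form bwa mem expects for its -R option. A quick illustration with placeholder field values:

rg = {"ID": "sample1", "PL": "ILLUMINA", "PU": "unit1", "LB": "lib1", "SM": "sample1"}
rg_fields = ["ID", "PL", "PU", "LB", "SM"]
rg_str = "\\t".join(["@RG"] + [field + ":" + rg[field] for field in rg_fields])
# rg_str == "@RG\tID:sample1\tPL:ILLUMINA\tPU:unit1\tLB:lib1\tSM:sample1" (backslash-t is literal)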
Example #9
def markDups(globs, cmds, rg_bamfile):
    # Mark duplicates of a BAM file

    dupmet_file = os.path.join(globs['iterbamdir'],
                               "iter-" + globs['iter-str'] + "-dupmets.txt")
    # Get the duplicate metrics file name required to output by picard.

    mkdup_cmd = globs[
        'picard-path'] + " MarkDuplicates I=" + rg_bamfile + " O=" + globs[
            'iter-final-bam'] + " VALIDATION_STRINGENCY=LENIENT M=" + dupmet_file + " CREATE_INDEX=true"
    if globs['tmpdir'] != "System default.":
        mkdup_cmd += " TMP_DIR=\"" + globs['tmpdir'] + "\""
    # Generate the MarkDuplicates command.

    cmds[mkdup_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Mark duplicates",
        'outfile': globs['iter-final-bam'],
        'logfile': globs['iter-final-bam-log'],
        'start': False
    }
    # Add the MarkDuplicates command to the global cmds dict.

    exit_flag = PC.runCMD(mkdup_cmd, globs, cmds, True)
    PC.exitCheck(exit_flag, globs)
    # Run the MarkDuplicates command and check for errors.

    return cmds


#############################################################################
#############################################################################

# def indexBAM(globs, cmds):
# # Index a BAM file with samtools.
# # Now done during MarkDuplicates with CREATE_INDEX=true

#     cur_logfile = os.path.join(globs['iterlogdir'], "samtools-index-iter-" + globs['iter-str'] + ".log");
#     index_bamfile = globs['iter-final-bam'] + ".bai";

#     index_cmd = globs['samtools-path'] + " index " + globs['iter-final-bam'];
#     cmds[index_cmd] = { 'cmd-num' : PC.getCMDNum(globs, len(cmds)), 'desc' : "Index BAM file", 'outfile' : index_bamfile, 'logfile' : cur_logfile, 'start' : False };
#     exit_flag = PC.runCMD(index_cmd, globs, cmds, True);
#     PC.exitCheck(exit_flag, globs);

#     return cmds;

#############################################################################

# def hisat2Index(globs, cmds, cur_ref):
# # Index a FASTA file with hisat2.
#     cur_logfile = os.path.join(globs['iterlogdir'], "hisat2-build-iter-" + globs['iter-str'] + ".log");
#     # Get the name of the logfile for the index command.

#     hisat2_build_cmd = globs['mapper-path'] + "-build " + cur_ref + " " + cur_ref;
#     # Generate the hisat-build command.

#     cmd_num = PC.getCMDNum(globs, len(cmds));
#     # Get the current command number for the log.

#     cmds[hisat2_build_cmd] = { 'cmd-num' : cmd_num, 'desc' : "hisat2-build index", 'outfile' : cur_ref,  'logfile' : cur_logfile, 'start' : False };
#     # Save the hisat-build command to the global cmds dict.

#     exit_flag = PC.runCMD(hisat2_build_cmd, globs, cmds, True);
#     PC.exitCheck(exit_flag, globs);
#     # Run the hisat-build command and check for errors.

#############################################################################

# def addRG(globs, cmds, bamfiles):
# # Run Picard's AddOrReplaceReadGroups on a merged BAM file.
# # This is now done during read mapping with bwa's -R option.

#     bwa_cmds, rg_bamfiles = {}, [];
#     for lib_type in bamfiles:
#         bamfile = bamfiles[lib_type];
#         cur_logfile = os.path.join(globs['iterlogdir'], "picard-add-rg-" + lib_type + "-iter-" + globs['iter-str'] + ".log");
#         rg_bamfile = os.path.join(globs['iterbamdir'], lib_type + "-iter-" + globs['iter-str'] + "-rg.bam.gz");
#         bamfiles.append(bamfile);

#         rg_cmd = globs['picard-path'] + " AddOrReplaceReadGroups I=" + bamfile + " O=" + rg_bamfile + " SO=coordinate LB=" + lib_type + " PL=illumina PU=misc SM=" + rg_lib + " VALIDATION_STRINGENCY=LENIENT";
#         if globs['tmpdir'] != "System default.":
#             rg_cmd += " TMP_DIR=\"" + globs['tmpdir'] + "\"";

#         bwa_cmd = globs['bwa-path'] + " mem -t " + str(globs['bwa-t']) + " " + cur_ref + " " + globs['libs'][lib_type] + " | " + globs['samtools-path'] + " view -bh - > " + bamfile;
#         cmd_num = PC.getCMDNum(globs, len(cmds))
#         cmds[bwa_cmd] = { 'cmd-num' : cmd_num, 'desc' : "BWA " + lib_type + " read mapping", 'outfile' : bamfile,  'logfile' : cur_logfile, 'start' : False };

#         bwa_cmds[bwa_cmd] = { 'cmd-num' : cmd_num, 'desc' : "BWA " + lib_type + " read mapping", 'logfile' : cur_logfile, 'start' : False };
#     # Prepare the BWA commands for each library

#     cur_logfile = os.path.join(globs['iterlogdir'], "picard-add-rg-iter-" + globs['iter-str'] + ".log");
#     rg_bamfile = os.path.join(globs['iterbamdir'], "merged-rg-iter-" + globs['iter-str'] + ".bam.gz");
#     rg_lib = "rg-iter-" + globs['iter-str'];

#     rg_cmd = globs['picard-path'] + " AddOrReplaceReadGroups I=" + merged_bamfile + " O=" + rg_bamfile + " SO=coordinate LB=" + rg_lib + " PL=illumina PU=misc SM=" + rg_lib + " VALIDATION_STRINGENCY=LENIENT";
#     if globs['tmpdir'] != "System default.":
#         rg_cmd += " TMP_DIR=\"" + globs['tmpdir'] + "\"";

#     cmds[rg_cmd] = { 'cmd-num' : PC.getCMDNum(globs, len(cmds)), 'desc' : "Add read groups", 'outfile' : rg_bamfile, 'logfile' : cur_logfile, 'start' : False };
#     exit_flag = PC.runCMD(rg_cmd, globs, cmds, True);
#     PC.exitCheck(exit_flag, globs);

#     return rg_bamfile, cmds;

#############################################################################
Example #10
def mergeBam(globs, cmds, bamfiles):
    # Merge BAM files from different library types.

    cur_logfile = os.path.join(
        globs['iterlogdir'],
        "picard-merge-bam-iter-" + globs['iter-str'] + ".log")
    merged_bamfile = os.path.join(
        globs['iterbamdir'], "merged-iter-" + globs['iter-str'] + ".bam.gz")
    # Get the log file and merged bam file name to output to.

    if len(bamfiles) > 1:
        # We only need to run picard if there are multiple bam files from mapping

        merge_cmd = globs['picard-path'] + " MergeSamFiles "
        for bamfile in bamfiles:
            merge_cmd += "I=" + bamfile + " "
        if globs['tmpdir'] != "System default.":
            merge_cmd += "TMP_DIR=\"" + globs['tmpdir'] + "\" "
        if not globs['mkdups']:
            merge_cmd += "CREATE_INDEX=true "
        merge_cmd += "USE_THREADING=TRUE VALIDATION_STRINGENCY=LENIENT O=" + merged_bamfile
        # Generate the MergeSamFiles command.

        cmds[merge_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Merge BAM files",
            'outfile': merged_bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Add the MergeSamFiles command to the global cmds dict.

        exit_flag = PC.runCMD(merge_cmd, globs, cmds, True)
        PC.exitCheck(exit_flag, globs)
        # Run the command and check for errors.

    else:
        # If there was only one bam file from mapping we don't need to merge, just move it to the expected location.

        merge_cmd = "mv " + bamfiles[0] + " " + merged_bamfile
        # Generate the mv command.

        cmds[merge_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Rename BAM file",
            'outfile': merged_bamfile,
            'logfile': "",
            'start': False
        }
        # Add the mv command to the global commands dict.

        if globs['dryrun']:
            PC.report_step(globs, cmds, merge_cmd, "DRYRUN")
        else:
            PC.report_step(globs, cmds, merge_cmd, "EXECUTING")
            os.system(merge_cmd)
            if os.path.isfile(merged_bamfile):
                PC.report_step(globs, cmds, merge_cmd, "SUCCESS")
            else:
                PC.report_step(globs, cmds, merge_cmd, "ERROR")
                PC.errorOut("PIMAP1", "Error renaming BAM file.", globs)
        # Run the command and check for errors.

    return merged_bamfile, cmds
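
The single-BAM branch above shells out to mv; a pure-Python equivalent using the standard library (an alternative, not what the example does) would be:

import os
import shutil

def rename_bam(src, dest):
    # Move the lone BAM to the expected merged name and confirm it exists.
    shutil.move(src, dest)
    return os.path.isfile(dest)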
Example #11
def hisat2(globs, cmds, cur_ref):
    # Map a set of reads with hisat2.

    hisat2_cmds, bamfiles = {}, []
    for lib_type in globs['libs']:
        # Generate a hisat2 command for each input fastq type.

        cur_logfile = os.path.join(
            globs['iterlogdir'],
            "hisat2-" + lib_type + "-iter-" + globs['iter-str'] + ".log")
        bamfile = os.path.join(
            globs['iterbamdir'],
            lib_type + "-iter-" + globs['iter-str'] + ".bam.gz")
        bamfiles.append(bamfile)
        # Get the bam file and log file for the current fastq file.

        rg_fields = ["ID", "PL", "PU", "LB", "SM"]
        # The read group fields to add to the output bam.

        hisat2_cmd = globs['mapper-path']
        for field in rg_fields:
            hisat2_cmd += " --rg " + field + ":" + globs['rg'][field]
        hisat2_cmd += " -p " + str(globs['map-t'])
        hisat2_cmd += " -x " + cur_ref
        if lib_type == 'pe':
            hisat2_cmd += " -1 " + globs['libs'][lib_type].split(" ")[0]
            hisat2_cmd += " -2 " + globs['libs'][lib_type].split(" ")[1]
        else:
            hisat2_cmd += " -U " + globs['libs'][lib_type]
        hisat2_cmd += " | " + globs['samtools-path'] + " sort"
        hisat2_cmd += " | " + globs['samtools-path'] + " view -bh -"
        hisat2_cmd += " > " + bamfile
        # Generate the hisat2 command, including adding read group info with --rg, and passing output to samtools for sorting
        # converting to .bam.

        cmd_num = PC.getCMDNum(globs, len(cmds))
        # Get the current command number for the log.

        cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': "BWA " + lib_type + " read mapping",
            'outfile': bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat command to the global cmds dict.

        hisat2_cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': "BWA " + lib_type + " read mapping",
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat2 command to the hisat2_cmds dict.
    # Prepare the hisat2 commands for each fastq type

    pool = mp.Pool(processes=globs['map-procs'])
    for result in pool.starmap(PC.runCMD, ((hisat2_cmd, globs, cmds, True)
                                           for hisat2_cmd in hisat2_cmds)):
        if result:
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()
    # Run the hisat2 commands across multiple processors, if specified
    # End the program if an error is encountered

    return bamfiles, cmds
Example #12
def genotypeGVCFs(globs, cmds, cur_ref):
    # Genotype the GVCFs from the last iteration by scaffold.

    gatk_cmds = {}
    for scaff in globs['scaffolds']:
        cur_logfile = os.path.join(
            globs['itervcflogdir'], "gatk-genotypegvcfs-" + scaff + "-iter-" +
            globs['iter-str'] + ".log")
        gvcf_file = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + ".gvcf.gz")
        vcf_file = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + ".vcf.gz")

        gatk_cmd = globs[
            'gatk-path'] + " GenotypeGVCFs -R " + cur_ref + " -V " + gvcf_file + " -O " + vcf_file + " --include-non-variant-sites"

        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmds[gatk_cmd] = {
            'cmd-num': cmd_num,
            'desc': "Genotype gVCF " + scaff,
            'outfile': vcf_file,
            'logfile': cur_logfile,
            'start': False
        }
        gatk_cmds[gatk_cmd] = {
            'cmd-num': cmd_num,
            'desc': "Genotype gVCF " + scaff,
            'outfile': vcf_file,
            'logfile': cur_logfile,
            'start': False
        }

    pool = mp.Pool(processes=globs['gvcf-procs'])
    for exit_flag in pool.starmap(PC.runCMD, ((gatk_cmd, globs, cmds, True)
                                              for gatk_cmd in gatk_cmds)):
        if exit_flag:
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()

    return cmds
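
The fan-out pattern used here (and in indexFa, BWA, hisat2, varFilter, and haplotypeCaller) is: build a dict of commands, run them in a multiprocessing pool, and abort on the first failure. A stripped-down sketch of that pattern with a stand-in worker; run_one is hypothetical and not the project's PC.runCMD.

import multiprocessing as mp
import subprocess

def run_one(cmd):
    # Return True on failure, mirroring how the exit flags are used above.
    return subprocess.run(cmd, shell=True).returncode != 0

def run_all(cmd_list, procs):
    with mp.Pool(processes=procs) as pool:
        for failed in pool.imap_unordered(run_one, cmd_list):
            if failed:
                pool.terminate()
                return False
    return True

# ok = run_all(["echo scaffold_1", "echo scaffold_2"], procs=2)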


# #############################################################################

# def gatherVcfs(vcfdir, cur_ref, globs):
# # Combine the region VCFs from haplotypeCallerMulti.
#     cur_logfile = os.path.join(globs['logdir'], "gatk-gathervcfs-iter-" + globs['iter-str'] + ".log");
#     infile_ext = ".vcf.gz";
#     outfile_ext = ".vcf.gz";
#     if globs['iteration'] == globs['num-iters']:
#         infile_ext = "-filtered.vcf.gz";
#         outfile_ext = "-filtered-final.vcf.gz";
#         cur_logfile = cur_logfile.replace(".log", "-final.log");
#     vcf_file = os.path.join(globs['itervcfdir'], "iter-" + globs['iter-str'] + outfile_ext);

#     run_flag = PC.runCheck([vcffile], cur_logfile, globs);

#     if run_flag:
#         params_file = os.path.join(globs['iterdir'], "iter-" + globs['iter-str'] + "-gathervcfs-params.txt");
#         with open(params_file, "w") as paramsfile:
#             for scaff in globs['scaffolds']:
#                 scaff_vcf = os.path.join(vcfdir, scaff + "-iter-" + globs['iter-str'] + infile_ext);
#                 paramsfile.write("-I " + scaff_vcf + "\n");
#         gatk_cmd = globs['gatk-path'] + " GatherVcfs --arguments_file " + params_file + " -O " + vcffile;
#         exit_flag = PC.runCMD(gatk_cmd, "GATK GatherVcfs", cur_logfile, True, globs);
#     else:
#         PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF file already exists", globs['pad'], sep=".") + vcffile + "\n");
#         exit_flag = False;

#     return vcffile, exit_flag;

# #############################################################################

# def indexVCF(vcffile, globs, suffix=""):
# # Index the combined VCF from gatherVcfs.
#     if suffix != "":
#         suffix = "-" + suffix;
#     cur_logfile = os.path.join(globs['iterlogdir'], "vcf-index-iter-" + globs['iter-str'] + suffix + ".log");
#     if globs['iteration'] == globs['num-iters']:
#         cur_logfile = cur_logfile.replace(".log", "-final.log");
#     index_file = vcffile + ".tbi";

#     run_flag = PC.runCheck([index_file], cur_logfile, globs);

#     if run_flag:
#         index_cmd = "tabix -fp vcf " + vcffile;
#         exit_flag = PC.runCMD(index_cmd, "tabix", cur_logfile, True, globs);
#     else:
#         PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF index file already exists", globs['pad'], sep=".") + vcffile + "\n");
#         exit_flag = False;

#     return exit_flag;

# #############################################################################
Example #13
def genConsensus(globs, cmds, vcf_file, cur_ref):
    # Run the command to generate a consensus FASTA file from the reference and the variants.

    cmd = "getConsCase()"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Determining case of first base",
        'outfile': "",
        'logfile': "",
        'start': False
    }

    bcftools_cmd = globs[
        'bcftools-path'] + " consensus -f " + cur_ref + " -o " + globs[
            'iter-final-fa']
    if globs['last-iter'] and globs['indels']:
        bcftools_cmd += " -c " + globs['iter-final-chain']
    if globs['last-iter'] and globs['diploid']:
        bcftools_cmd += " -I "
    bcftools_cmd += " -e \"FILTER='pseudoit' || FILTER='IndelGap'\" " + vcf_file
    cmds[bcftools_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Generating consensus",
        'outfile': globs['iter-final-fa'],
        'logfile': globs['iter-consensus-log'],
        'start': False
    }

    run_flag = True
    if globs['resume']:
        run_flag = PC.runCheck(bcftools_cmd, cmds, globs)

    #### RUN RUNCHECK FIRST

    first_lower = False
    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        first_lower, linestr_orig, linestr_repl = True, "a", "A"
    elif run_flag:
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        first_lower, linestr_orig, linestr_repl = getConsCase(cur_ref)
        PC.report_step(globs, cmds, cmd, "SUCCESS",
                       "First base: " + linestr_orig[0])
    # This first_lower stuff is a hack to deal with bcftools consensus using the case of the first base in the reference fasta to inject variants.
    # Possibly resolved: https://github.com/samtools/bcftools/issues/1150#issuecomment-582407490
    # Need to test and make sure it is in official release before I remove this hack.

    if first_lower:
        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + cur_ref
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Changing first ref base to upper case",
            'outfile': "",
            'logfile': "",
            'start': False
        }

        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base converted to upper case")
    # Part of first_lower hack.

    exit_flag = PC.runCMD(bcftools_cmd, globs, cmds, True)
    # Consensus command
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    if first_lower:
        cmd = "sed -i '2 s/" + linestr_repl + "/" + linestr_orig + "/g' " + cur_ref
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Reverting case of first ref base",
            'outfile': "",
            'logfile': "",
            'start': False
        }

        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base reverted to original case")

        if not globs['dryrun']:
            first_lower, linestr_orig, linestr_repl = getConsCase(
                globs['iter-final-fa'])

        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + globs[
            'iter-final-fa']
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Reverting case of first consensus base",
            'outfile': "",
            'logfile': "",
            'start': False
        }

        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base reverted to original case")
    # Part of first_lower hack.

    globs['consensus-file'] = globs['iter-final-fa']

    return cmds, globs
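
getConsCase() is referenced above but not defined in these examples. Based on how its three return values are used (a flag plus the original and case-flipped versions of the first sequence line), a plausible reconstruction, offered only as an assumption, might be:

def getConsCase(fasta_path):
    # Read the first sequence line (line 2) of the FASTA; this is an assumed
    # reconstruction of the helper, not the project's actual implementation.
    with open(fasta_path, "r") as fa:
        fa.readline()                              # skip the ">" header line
        line = fa.readline().rstrip("\n")
    first_lower = line[0].islower()
    return first_lower, line, line[0].swapcase() + line[1:]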
Example #14
def cleanUp(globs, cmds):
    i = globs['iter-str']
    prev_i = str(int(i) - 1)
    if len(prev_i) == 1:
        prev_i = "0" + prev_i

    possible_map_files = {
        'iter-' + i + "-dupmets.txt": 2,
        "merged-iter-" + i + ".bam.gz": 2,
        "merged-rg-iter-" + i + ".bam.gz": 2,
        "merged-rg-mkdup-iter-" + i + ".bam.gz": 1,
        "merged-rg-mkdup-iter-" + i + ".bam.gz.bai": 1,
        "pe-iter-" + i + ".bam.gz": 2,
        "pem-iter-" + i + ".bam.gz": 2,
        "se-iter-" + i + ".bam.gz": 2
    }

    possible_vcf_files = {
        "vcf-scaff": 2,
        "gvcf-scaff": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz.tbi": 2,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz": 1,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz.tbi": 1,
        "iter-" + i + "-gathervcfs-params.txt": 2,
        "iter-" + i + "-filter.vcf.gz": 1,
        "iter-" + i + "-filter.vcf.gz.tbi": 1,
        "iter-" + i + "-filter-snps.vcf.gz": 1,
        "iter-" + i + "-filter-snps.vcf.gz.tbi": 1
    }

    possible_fa_files = [
        "iter-" + prev_i + "-masked.fa", "iter-" + prev_i + "snps-masked.fa",
        "iter-" + i + "-snps-intermediate.dict",
        "iter-" + i + "-snps-intermediate.fa",
        "iter-" + i + "-snps-intermediate.fa.amb",
        "iter-" + i + "-snps-intermediate.fa.ann",
        "iter-" + i + "-snps-intermediate.fa.bwt",
        "iter-" + i + "-snps-intermediate.fa.fai",
        "iter-" + i + "-snps-intermediate.fa.pac",
        "iter-" + i + "-snps-intermediate.fa.sa"
    ]

    if globs['last-iter'] and globs['keeplevel'] == 0:
        globs['keeplevel'] = 1

    for f in possible_map_files:
        if possible_map_files[f] > globs['keeplevel']:
            full_f = os.path.join(globs['iterbamdir'], f)
            if os.path.isfile(full_f):
                cmd = "os.remove(" + full_f + ")"
                cmds[cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Removing file",
                    'outfile': "",
                    'logfile': "",
                    'start': False
                }
                if globs['dryrun']:
                    PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
                else:
                    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
                    os.remove(full_f)

    for f in possible_vcf_files:
        if possible_vcf_files[f] > globs['keeplevel']:
            full_f = os.path.join(globs['itervcfdir'], f)

            if os.path.isfile(full_f):
                cmd = "os.remove(" + full_f + ")"
                cmds[cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Removing file",
                    'outfile': "",
                    'logfile': "",
                    'start': False
                }
                if globs['dryrun']:
                    PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
                else:
                    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
                    os.remove(full_f)
            elif os.path.isdir(full_f):
                cmd = "shutil.rmtree(" + full_f + ")"
                cmds[cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Removing directory",
                    'outfile': "",
                    'logfile': "",
                    'start': False
                }
                if globs['dryrun']:
                    PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
                else:
                    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
                    shutil.rmtree(full_f)

    return cmds
Example #15
def indexCheck(cur_fa, globs, cmds):
    # Checks that the user has created the proper index files before running the program.

    ref_ext = PC.detectRefExt(cur_fa, globs)

    dictfile = cur_fa.replace(ref_ext, ".dict")
    cmd = "os.path.isfile(" + dictfile + ")"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Checking ref indices",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    if not os.path.isfile(dictfile):
        PC.errorOut(
            "REF1",
            "Reference dictionary not found. Please run: picard CreateSequenceDictionary R=<ref>.fa O=<ref>.dict",
            globs)
    PC.report_step(globs, cmds, cmd, "SUCCESS", "index file found", "")
    # Check for the reference dictionary file.

    faidxfile = cur_fa + ".fai"
    cmd = "os.path.isfile(" + faidxfile + ")"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Checking ref indices",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    if not os.path.isfile(faidxfile):
        PC.errorOut(
            "REF2",
            "Reference index (samtools) not found. Please run: samtools faidx <ref>.fa",
            globs)
    PC.report_step(globs, cmds, cmd, "SUCCESS", "index file found")
    # Check for the reference faidx file.

    if globs['mapper'] == "bwa":
        indexfiles = [
            cur_fa + ".amb", cur_fa + ".ann", cur_fa + ".bwt", cur_fa + ".pac",
            cur_fa + ".sa"
        ]
        cmd = "os.path.isfile(" + ",".join(indexfiles) + ")"
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Checking ref indices",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        if any(not os.path.isfile(f) for f in indexfiles):
            PC.errorOut(
                "REF3",
                "Reference index (bwa) not found. Please run: bwa index <ref>.fa",
                globs)
        PC.report_step(globs, cmds, cmd, "SUCCESS", "index files found")
    # Check for the bwa index files if --mapper is bwa.

    elif globs['mapper'] == "hisat2":
        indexfile = cur_fa + ".1.ht2"
        cmd = "os.path.isfile(" + indexfile + ")"
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Checking ref indices",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        if not os.path.isfile(indexfile):
            PC.errorOut(
                "REF3",
                "Reference index (hisat2) not found. Please run: hisat2-build <ref>.fa <ref>.fa",
                globs)
        PC.report_step(globs, cmds, cmd, "SUCCESS", "index file found")
    # Check for the hisat2 index files if --mapper is hisat2.

    return cmds
Example #16
def varFilter(globs, cmds, cur_ref):
    # Run the command to filter variants from a VCF file based on input filters. Default: "MQ < 30.0 || DP < 5 || DP > 60"

    bcftools_cmds = {}
    for scaff in globs['scaffolds']:
        # if not globs['last-iter'] or (globs['last-iter'] and not globs['indels']):
        #     cur_logfile = os.path.join(globs['itervcflogdir'], "bcftools-filter-" + scaff + "-iter-" + globs['iter-str'] + "-snps.log");
        #     vcf_file = os.path.join(globs['itervcfscaffdir'], scaff + "-iter-" + globs['iter-str'] + "-snps.vcf.gz");
        #     filter_file = os.path.join(globs['itervcfscaffdir'], scaff + "-iter-" + globs['iter-str'] + "-snps-filter.vcf.gz");
        # else:
        cur_logfile = os.path.join(
            globs['itervcflogdir'],
            "bcftools-filter-" + scaff + "-iter-" + globs['iter-str'] + ".log")
        vcf_file = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + ".vcf.gz")
        filter_file = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + "-filter.vcf.gz")

        bcftools_cmd = globs['bcftools-path'] + " filter -m+ -e " + globs[
            'filter'] + " -s pseudoit --IndelGap 5 -Oz -o " + filter_file + " " + vcf_file

        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmds[bcftools_cmd] = {
            'cmd-num': cmd_num,
            'desc': "Filter VCF " + scaff,
            'outfile': filter_file,
            'logfile': cur_logfile,
            'start': False,
            "vcffile": vcf_file
        }
        bcftools_cmds[bcftools_cmd] = {
            'cmd-num': cmd_num,
            'desc': "Filter VCF " + scaff,
            'outfile': filter_file,
            'logfile': cur_logfile,
            'start': False,
            "vcffile": vcf_file
        }

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        bcftools_skeleton_cmd = globs[
            'bcftools-path'] + " filter -m+ -e " + globs[
                'filter'] + " -s pseudoit --IndelGap 5 -Oz -o <filtered vcf> <input vcf>"
        cmds[bcftools_skeleton_cmd] = {
            'cmd-num': cmd_num,
            'desc':
            str(globs['filter-procs']) + " bcftools filter procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, bcftools_skeleton_cmd, "DRYRUN",
                       bcftools_skeleton_cmd)

    else:
        pool = mp.Pool(processes=globs['filter-procs'])
        for result in pool.starmap(PC.runCMD,
                                   ((bcftools_cmd, globs, cmds, True)
                                    for bcftools_cmd in bcftools_cmds)):
            if result:
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
        pool.terminate()

    return cmds
Example #17
def haplotypeCaller(globs, cmds, cur_ref, dup_bamfile):
    # Run HaplotypeCaller for each scaffold.

    gatk_cmds = {}
    for scaff in globs['scaffolds']:
        cur_logfile = os.path.join(
            globs['itervcflogdir'], "gatk-haplotypcaller-" + scaff + "-iter-" +
            globs['iter-str'] + ".log")
        if globs['last-iter']:
            vcffile = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + ".gvcf.gz")
        else:
            vcffile = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + ".vcf.gz")

        gatk_cmd = globs[
            'gatk-path'] + " HaplotypeCaller -R " + cur_ref + " -I " + dup_bamfile + " -L \"" + scaff + "\" -stand-call-conf 30 --native-pair-hmm-threads " + str(
                globs['gatk-t'])
        if globs['last-iter']:
            gatk_cmd += " -ERC GVCF"
        # The final iteration outputs GVCFs to properly emit all sites
        gatk_cmd += " -O " + vcffile

        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmds[gatk_cmd] = {
            'cmd-num': cmd_num,
            'desc': "HaplotypeCaller " + scaff,
            'outfile': vcffile,
            'logfile': cur_logfile,
            'start': False
        }
        gatk_cmds[gatk_cmd] = {
            'cmd-num': cmd_num,
            'desc': "HaplotypeCaller " + scaff,
            'outfile': vcffile,
            'logfile': cur_logfile,
            'start': False
        }

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        gatk_skeleton_cmd = globs[
            'gatk-path'] + " HaplotypeCaller -R <reference fasta> -I <BAM file> -L \"<scaffold>\" -stand-call-conf 30 --native-pair-hmm-threads " + str(
                globs['gatk-t'])
        if globs['last-iter']:
            gatk_skeleton_cmd += " -ERC GVCF"
        # The final iteration outputs GVCFs to properly emit all sites
        gatk_skeleton_cmd += " -O <vcf file>"
        cmds[gatk_skeleton_cmd] = {
            'cmd-num': cmd_num,
            'desc':
            str(globs['gatk-procs']) + " HaplotypeCaller procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, gatk_skeleton_cmd, "DRYRUN",
                       gatk_skeleton_cmd)

    else:
        pool = mp.Pool(processes=globs['gatk-procs'])
        for exit_flag in pool.starmap(PC.runCMD, ((gatk_cmd, globs, cmds, True)
                                                  for gatk_cmd in gatk_cmds)):
            if exit_flag:
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
        pool.terminate()

    return cmds
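
Finally, PC.runCMD(cmd, globs, cmds, report) appears throughout these examples but is not shown. A minimal stand-in with a similar contract (run the shell command, send its output to the logfile recorded for it, and return True on failure) might look like the following; this is an assumption for illustration, not the real helper.

import subprocess

def run_cmd(cmd, cmds):
    # Run one registered shell command and report failure via the return value.
    logfile = cmds[cmd].get('logfile') or "/dev/null"
    with open(logfile, "a") as log:
        proc = subprocess.run(cmd, shell=True, stdout=log, stderr=subprocess.STDOUT)
    return proc.returncode != 0   # True signals an error, like the exit flags above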