Пример #1
0
def pe_align(log, sample, sample_dir, ref, cores, r1, r2):
    """Align paired-end reads with ``bwa sampe`` and store the result as BAM.

    A ``.sai`` file is created for each read file through ``create_sai``.
    The ``bwa sampe`` output is then piped into ``samtools view -bS -`` and
    written to ``<sample_dir>/<sample>.bam``.  The stderr stream of each
    program goes to its own log file inside ``sample_dir``.  Both ``.sai``
    files are deleted once the pipeline has finished.

    Returns the path of the BAM file.
    """
    sai_files = [
        create_sai(log, sample, sample_dir, ref, cores, reads, number)
        for number, reads in enumerate((r1, r2), start=1)
    ]
    sampe_cmd = [get_user_path("bwa", "bwa"), "sampe", "-a", "700", ref]
    sampe_cmd += sai_files
    sampe_cmd += [r1.pth, r2.pth]
    view_cmd = [get_user_path("samtools", "samtools"), "view", "-bS", "-"]
    sampe_log = os.path.join(sample_dir, '{}.pe.bwa-sampe-out.log'.format(sample))
    view_log = os.path.join(sample_dir, '{}.pe.samtools-out.log'.format(sample))
    bam_path = os.path.join(sample_dir, '{}.bam'.format(sample))
    log.info("Building BAM for {}".format(sample))
    with open(sampe_log, 'w') as sampe_err, \
            open(view_log, 'w') as view_err, \
            open(bam_path, 'w') as bam_fh:
        producer = subprocess.Popen(sampe_cmd, stdout=subprocess.PIPE, stderr=sampe_err)
        consumer = subprocess.Popen(view_cmd, stdin=producer.stdout, stdout=bam_fh, stderr=view_err)
        # close the parent's copy of the pipe; only samtools keeps reading it
        producer.stdout.close()
        consumer.communicate()
    # the sai files are only intermediates for sampe, so drop them
    for sai in sai_files:
        os.remove(sai)
    return bam_path
Пример #2
0
def phase(log, sample, sample_dir, reference, bam):
    """Pipe ``samtools calmd`` output into ``samtools phase`` for one sample.

    ``sample_dir`` is used as a filename prefix here: it is handed to
    ``samtools phase -b`` and it also prefixes both log file names.  The
    calmd stderr goes to one log; stdout and stderr of the phase step go to
    the other.

    Returns the ``<sample_dir>.0.bam`` and ``<sample_dir>.1.bam`` names.
    """
    log.info("Phasing BAM file with CALMD for {}".format(sample))
    calmd_cmd = [get_user_path("binaries", "samtools"), "calmd", "-A", "-E", "-u", "-r", bam, reference]
    phase_cmd = [get_user_path("binaries", "samtools"), "phase", "-A", "-F", "-Q", "20", "-b", sample_dir, "-"]
    calmd_log = '{}.samtools-calmd-phase-out.log'.format(sample_dir)
    phase_log = '{}.samtools-phase-out.log'.format(sample_dir)
    with open(calmd_log, 'w') as calmd_err, open(phase_log, 'w') as phase_out:
        upstream = subprocess.Popen(calmd_cmd, stdout=subprocess.PIPE, stderr=calmd_err)
        downstream = subprocess.Popen(phase_cmd, stdin=upstream.stdout, stdout=phase_out, stderr=subprocess.STDOUT)
        # close the parent's copy of the pipe; only the phase step reads it
        upstream.stdout.close()
        downstream.communicate()
    first_bam = "{}.0.bam".format(sample_dir)
    second_bam = "{}.1.bam".format(sample_dir)
    return first_bam, second_bam
Пример #3
0
def phase(log, sample, sample_dir, reference, bam):
    """Run ``samtools calmd | samtools phase`` and return the two BAM names.

    ``sample_dir`` acts as an output prefix: it is passed to ``phase -b``
    and is also the stem of the two log files written by this function.
    """
    log.info("Phasing BAM file with CALMD for {}".format(sample))
    calmd = [
        get_user_path("binaries", "samtools"),
        "calmd",
        "-A",
        "-E",
        "-u",
        "-r",
        bam,
        reference,
    ]
    phaser = [
        get_user_path("binaries", "samtools"),
        "phase",
        "-A",
        "-F",
        "-Q",
        "20",
        "-b",
        sample_dir,
        "-",
    ]
    calmd_log_name = '{}.samtools-calmd-phase-out.log'.format(sample_dir)
    phase_log_name = '{}.samtools-phase-out.log'.format(sample_dir)
    with open(calmd_log_name, 'w') as calmd_log, \
            open(phase_log_name, 'w') as phase_log:
        source = subprocess.Popen(calmd, stdout=subprocess.PIPE, stderr=calmd_log)
        sink = subprocess.Popen(phaser, stdin=source.stdout, stdout=phase_log, stderr=subprocess.STDOUT)
        # the parent does not read the pipe itself, so release its end
        source.stdout.close()
        sink.communicate()
    outputs = ("{}.0.bam".format(sample_dir), "{}.1.bam".format(sample_dir))
    return outputs
Пример #4
0
def mem_pe_align(log, sample, sample_dir, ref, cores, r1, r2):
    """Align paired-end reads with ``bwa mem`` and store the result as BAM.

    ``bwa mem -t <cores> -M`` output is piped into ``samtools view -bS -``
    and written to ``<sample_dir>/<sample>.bam``.  The stderr stream of each
    program is captured in its own log file inside ``sample_dir``.

    Returns the path of the BAM file.
    """
    bwa_cmd = [get_user_path("bwa", "bwa"), "mem", "-t", str(cores), "-M", ref, r1.pth, r2.pth]
    view_cmd = [get_user_path("samtools", "samtools"), "view", "-bS", "-"]
    bwa_log = os.path.join(sample_dir, '{}.pe.bwa-sampe-out.log'.format(sample))
    view_log = os.path.join(sample_dir, '{}.pe.samtools-view-out.log'.format(sample))
    bam_path = os.path.join(sample_dir, '{}.bam'.format(sample))
    log.info("Building BAM for {}".format(sample))
    with open(bwa_log, 'w') as bwa_err, \
            open(view_log, 'w') as view_err, \
            open(bam_path, 'w') as bam_fh:
        aligner = subprocess.Popen(bwa_cmd, stdout=subprocess.PIPE, stderr=bwa_err)
        converter = subprocess.Popen(view_cmd, stdin=aligner.stdout, stdout=bam_fh, stderr=view_err)
        # close the parent's copy of the pipe; only samtools keeps reading it
        aligner.stdout.close()
        converter.communicate()
    return bam_path
Пример #5
0
 def __init__(self,
              target,
              query,
              coverage,
              identity,
              out=False,
              min_match=None):
     """Build the lastz command line and store it on ``self.cli``.

     ``self.output`` is ``out`` when that is truthy; otherwise it is the
     path of a fresh temp file with a ``.lastz`` suffix.

     When ``identity`` is truthy and ``min_match`` is falsy, the command
     filters with ``--coverage``.  When ``min_match`` is truthy, the command
     uses ``--matchcount`` in that position instead.

     NOTE(review): if both ``identity`` and ``min_match`` are falsy, neither
     branch runs and ``self.cli`` is never assigned -- confirm callers never
     do that.
     """
     # if not an output file, create a temp file to hold output
     if not out:
         fd, self.output = tempfile.mkstemp(suffix='.lastz')
         # only the path is needed; lastz is told to write there via --output
         os.close(fd)
     else:
         self.output = out
     # The backslash-newline pairs sit inside the string literal, so the
     # indentation of each continuation line becomes part of the command text.
     if identity and not min_match:
         self.cli = '{5} {0}[multiple,nameparse=full] {1}[nameparse=full]\
             --strand=both \
             --seed=12of19 \
             --transition \
             --nogfextend \
             --nochain \
             --gap=400,30 \
             --xdrop=910 \
             --ydrop=8370 \
             --hspthresh=3000 \
             --gappedthresh=3000 \
             --noentropy \
             --coverage={2} \
             --identity={3} \
             --output={4} \
             --format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity'.format(
             target, query, coverage, identity, self.output,
             get_user_path("lastz", "lastz"))
     # same command, but slot {2} carries min_match for --matchcount
     elif min_match:
         self.cli = '{5} {0}[multiple,nameparse=full] {1}[nameparse=full]\
             --strand=both \
             --seed=12of19 \
             --transition \
             --nogfextend \
             --nochain \
             --gap=400,30 \
             --xdrop=910 \
             --ydrop=8370 \
             --hspthresh=3000 \
             --gappedthresh=3000 \
             --noentropy \
             --matchcount={2} \
             --identity={3} \
             --output={4} \
             --format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity'.format(
             target, query, min_match, identity, self.output,
             get_user_path("lastz", "lastz"))
Пример #6
0
def coverage(log, sample, assembly_pth, assembly, cores, bam):
    """Run GATK ``DepthOfCoverage`` for ``sample`` from inside ``assembly_pth``.

    The process working directory is switched to ``assembly_pth`` while GATK
    runs.  It is always switched back afterwards, even when resolving the
    GATK path, opening the log file or launching the process raises.
    Combined stdout/stderr of GATK is written to
    ``<assembly_pth>/<sample>.GATK-coverage-out.log``.

    Returns ``<assembly_pth>/<sample>-coverage``, i.e. the ``-o`` value
    given to GATK joined onto ``assembly_pth``.
    """
    log.info("Computing coverage with GATK for {}".format(sample))
    cwd = os.getcwd()
    # move into reference directory
    os.chdir(assembly_pth)
    try:
        cmd = [
            get_user_path("binaries", "gatk"),
            "-T",
            "DepthOfCoverage",
            "-R",
            assembly,
            "-I",
            bam,
            "-o",
            "{}-coverage".format(sample),
            "-nt",
            str(cores),
            "--omitIntervalStatistics",
            "--omitLocusTable"
        ]
        gatk_coverage_fname = os.path.join(assembly_pth, '{}.GATK-coverage-out.log'.format(sample))
        with open(gatk_coverage_fname, 'w') as gatk_out:
            proc = subprocess.Popen(cmd, stdout=gatk_out, stderr=subprocess.STDOUT)
            proc.communicate()
    finally:
        # never leave the caller stranded in the reference directory
        os.chdir(cwd)
    return os.path.join(assembly_pth, "{}-coverage".format(sample))
Пример #7
0
 def test_config_directories_exist(self):
     # every (section, option) pair in self.directories must resolve to a
     # path that is an existing directory
     for entry in self.directories:
         resolved = get_user_path(entry[0], entry[1], package_only=True)
         found = os.path.isdir(resolved)
         self.assertTrue(found, "Directory {} is missing".format(resolved))
Пример #8
0
 def test_binaries_exist(self):
     """Test that binaries in config are properly located"""
     for entry in self.binaries:
         path = get_user_path(entry[0], entry[1], package_only=True)
         # a usable binary is a regular file with the executable bit set
         usable = os.path.isfile(path) and os.access(path, os.X_OK)
         self.assertTrue(usable, "Binary {} is missing".format(path))
Пример #9
0
 def test_binaries_exist(self):
     """Test that binaries in config are properly located"""
     for section_option in self.binaries:
         located = get_user_path(section_option[0], section_option[1], package_only=True)
         problem = "Binary {} is missing".format(located)
         # must be a regular file and executable
         is_executable_file = os.path.isfile(located) and os.access(located, os.X_OK)
         self.assertTrue(is_executable_file, problem)
Пример #10
0
 def test_config_directories_exist(self):
     # each configured directory has to be present on disk
     for section_option in self.directories:
         location = get_user_path(section_option[0], section_option[1], package_only=True)
         problem = "Directory {} is missing".format(location)
         self.assertTrue(os.path.isdir(location), problem)
Пример #11
0
 def test_config_binaries(self):
     """Test that config is properly located"""
     for entry in self.binaries:
         configured = get_user_path(entry[0], entry[1], package_only=True)
         # the third field of each entry is the file name expected under
         # <sys.prefix>/bin
         wanted = os.path.join(sys.prefix, "bin", entry[2])
         failure = "Config entry {} != {} (expected)".format(configured, wanted)
         self.assertEqual(configured, wanted, failure)
Пример #12
0
 def test_config_directories(self):
     # each configured directory is expected directly under sys.prefix,
     # named after the second field of its entry
     for entry in self.directories:
         configured = get_user_path(entry[0], entry[1], package_only=True)
         wanted = os.path.join(sys.prefix, entry[1])
         failure = "Directory {} is missing".format(entry[1])
         self.assertEqual(configured, wanted, failure)
Пример #13
0
 def __init__(self, target, query, coverage, identity, out=False, min_match=None):
     """Build the lastz command line and store it on ``self.cli``.

     ``self.output`` is ``out`` when that is truthy; otherwise it is the
     path of a fresh temp file with a ``.lastz`` suffix.

     When ``identity`` is truthy and ``min_match`` is falsy, the command
     filters with ``--coverage``.  When ``min_match`` is truthy, the command
     uses ``--matchcount`` in that position instead.

     NOTE(review): if both ``identity`` and ``min_match`` are falsy, neither
     branch runs and ``self.cli`` is never assigned -- confirm callers never
     do that.
     """
     # if not an output file, create a temp file to hold output
     if not out:
         fd, self.output = tempfile.mkstemp(suffix=".lastz")
         # only the path is needed; lastz is told to write there via --output
         os.close(fd)
     else:
         self.output = out
     # The backslash-newline pairs sit inside the string literal, so the
     # indentation of each continuation line becomes part of the command text.
     if identity and not min_match:
         self.cli = "{5} {0}[multiple,nameparse=full] {1}[nameparse=full]\
             --strand=both \
             --seed=12of19 \
             --transition \
             --nogfextend \
             --nochain \
             --gap=400,30 \
             --xdrop=910 \
             --ydrop=8370 \
             --hspthresh=3000 \
             --gappedthresh=3000 \
             --noentropy \
             --coverage={2} \
             --identity={3} \
             --output={4} \
             --format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity".format(
             target, query, coverage, identity, self.output, get_user_path("lastz", "lastz")
         )
     # same command, but slot {2} carries min_match for --matchcount
     elif min_match:
         self.cli = "{5} {0}[multiple,nameparse=full] {1}[nameparse=full]\
             --strand=both \
             --seed=12of19 \
             --transition \
             --nogfextend \
             --nochain \
             --gap=400,30 \
             --xdrop=910 \
             --ydrop=8370 \
             --hspthresh=3000 \
             --gappedthresh=3000 \
             --noentropy \
             --matchcount={2} \
             --identity={3} \
             --output={4} \
             --format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity".format(
             target, query, min_match, identity, self.output, get_user_path("lastz", "lastz")
         )
Пример #14
0
 def test_config_directories(self):
     # the configured path must equal <sys.prefix>/<second field of entry>
     for section_option in self.directories:
         actual = get_user_path(section_option[0], section_option[1], package_only=True)
         target = os.path.join(sys.prefix, section_option[1])
         note = "Directory {} is missing".format(section_option[1])
         self.assertEqual(actual, target, note)
Пример #15
0
 def test_config_binaries(self):
     """Test that config is properly located"""
     for section_option in self.binaries:
         actual = get_user_path(section_option[0], section_option[1], package_only=True)
         # binaries are expected in <sys.prefix>/bin under the name given as
         # the third field of the entry
         target = os.path.join(sys.prefix, "bin", section_option[2])
         note = "Config entry {} != {} (expected)".format(actual, target)
         self.assertEqual(actual, target, note)
Пример #16
0
def index(log, sample, sample_dir, bam):
    """Run ``samtools index`` on ``bam``.

    Combined stdout/stderr of samtools is written to
    ``<sample_dir>/<sample>.samtools-index-out.log``.  Nothing is returned.
    """
    log.info("Indexing BAM for {}".format(sample))
    samtools = get_user_path("samtools", "samtools")
    log_name = '{}.samtools-index-out.log'.format(sample)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_fh:
        # stderr is folded into stdout so one log holds everything
        runner = subprocess.Popen([samtools, "index", bam], stdout=log_fh, stderr=subprocess.STDOUT)
        runner.communicate()
Пример #17
0
def create_faidx(log, sample, sample_dir, fasta):
    """Run ``samtools faidx`` on ``fasta``.

    Combined stdout/stderr of samtools is written to
    ``<sample_dir>/<sample>.samtools-faidx-out.log``.  Nothing is returned.
    """
    log.info("Indexing fasta for {}".format(sample))
    samtools = get_user_path("samtools", "samtools")
    log_name = '{}.samtools-faidx-out.log'.format(sample)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_fh:
        # stderr is folded into stdout so one log holds everything
        runner = subprocess.Popen([samtools, "faidx", fasta], stdout=log_fh, stderr=subprocess.STDOUT)
        runner.communicate()
Пример #18
0
def create_faidx(log, sample, sample_dir, fasta):
    """Index ``fasta`` with ``samtools faidx``, logging into ``sample_dir``.

    The log file ``<sample>.samtools-faidx-out.log`` receives both stdout
    and stderr of samtools.  Nothing is returned.
    """
    log.info("Indexing fasta for {}".format(sample))
    faidx_cmd = [get_user_path("samtools", "samtools"), "faidx", fasta]
    faidx_log = os.path.join(sample_dir, '{}.samtools-faidx-out.log'.format(sample))
    with open(faidx_log, 'w') as sink:
        subprocess.Popen(faidx_cmd, stdout=sink, stderr=subprocess.STDOUT).communicate()
Пример #19
0
def create_index_files(log, reference):
    """Run ``bwa index`` on ``reference`` from inside its directory.

    The working directory is switched to the directory part of
    ``reference`` while bwa runs.  It is always restored afterwards, even
    when launching bwa or opening the log raises.  Output of bwa is appended
    to ``bwa-index-file.log`` in that directory.

    A bare file name (no directory part) is indexed in the current
    directory instead of failing on ``os.chdir('')``.

    NOTE(review): ``reference`` is handed to bwa unchanged after the
    directory switch, so a relative path that contains a directory part
    would no longer resolve -- callers presumably pass absolute paths.
    """
    log.info("Running bwa indexing against {}".format(reference))
    cwd = os.getcwd()
    # dirname() is '' for a bare file name, which os.chdir() rejects
    reference_dir = os.path.dirname(reference) or os.curdir
    # move into reference directory
    os.chdir(reference_dir)
    try:
        cmd = [get_user_path("bwa", "bwa"), "index", reference]
        with open('bwa-index-file.log', 'a') as outf:
            proc = subprocess.Popen(cmd, stdout=outf, stderr=subprocess.STDOUT)
            proc.communicate()
    finally:
        # move back to working directory
        os.chdir(cwd)
Пример #20
0
 def __init__(self, target, query, out=False):
     """Store the output path and the lastz command line on the instance.

     ``self.output`` is ``out`` when that is truthy, otherwise the path of a
     new temp file with a ``.lastz`` suffix.  ``self.cli`` is the command
     string built from ``target``, ``query`` and that output path.
     """
     if out:
         self.output = out
     else:
         # no output file requested: let lastz write into a temp file
         handle, self.output = tempfile.mkstemp(suffix='.lastz')
         os.close(handle)
     # continuation-line indentation is inside the literal, so it is kept
     # exactly as it was
     self.cli = '{3} {0}[multiple,nameparse=full] {1}[nameparse=full]\
             --output={2} \
             --format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity'.format(
         target, query, self.output, get_user_path("lastz", "lastz"))
Пример #21
0
def index(log, sample, sample_dir, bam):
    """Index ``bam`` with ``samtools index``, logging into ``sample_dir``.

    The log file ``<sample>.samtools-index-out.log`` receives both stdout
    and stderr of samtools.  Nothing is returned.
    """
    log.info("Indexing BAM for {}".format(sample))
    index_cmd = [get_user_path("samtools", "samtools"), "index", bam]
    index_log = os.path.join(sample_dir, '{}.samtools-index-out.log'.format(sample))
    with open(index_log, 'w') as sink:
        subprocess.Popen(index_cmd, stdout=sink, stderr=subprocess.STDOUT).communicate()
Пример #22
0
def call(log, sample, sample_dir, reference, bam, phase=None):
    """Run ``samtools mpileup | bcftools view -cg - | vcfutils vcf2fq``.

    ``sample_dir`` is used as a filename prefix for the three stderr logs
    and for the FASTQ output.  When ``phase`` is not ``None`` it is inserted
    into the FASTQ file name (``<sample_dir>.<phase>.fq``).

    Returns the FASTQ file name.
    """
    if phase is not None:
        log.info("Creating REF/ALT allele FASTQ file {}".format(phase))
    else:
        log.info("Creating REF/ALT allele FASTQ file --Unphased--")
    mpileup_cmd = [get_user_path("samtools", "samtools"), "mpileup", "-u", "-f", reference, bam]
    bcftools_cmd = [get_user_path("samtools", "bcftools"), "view", "-cg", "-"]
    vcfutils_cmd = [get_user_path("samtools", "vcfutils"), "vcf2fq"]
    mpileup_log = "{}.samtools-mpileup-out.log".format(sample_dir)
    bcftools_log = "{}.samtools-bcftools-out.log".format(sample_dir)
    vcfutils_log = "{}.samtools-vcfutils-out.log".format(sample_dir)
    if phase is not None:
        fastq_name = "{}.{}.fq".format(sample_dir, phase)
    else:
        fastq_name = "{}.fq".format(sample_dir)
    with open(mpileup_log, 'w') as mpileup_err, \
            open(bcftools_log, 'w') as bcftools_err, \
            open(vcfutils_log, 'w') as vcfutils_err, \
            open(fastq_name, 'w') as fastq_fh:
        stage1 = subprocess.Popen(mpileup_cmd, stdout=subprocess.PIPE, stderr=mpileup_err)
        stage2 = subprocess.Popen(bcftools_cmd, stdin=stage1.stdout, stdout=subprocess.PIPE, stderr=bcftools_err)
        stage3 = subprocess.Popen(vcfutils_cmd, stdin=stage2.stdout, stdout=fastq_fh, stderr=vcfutils_err)
        # the parent reads neither pipe, so release both of its ends
        stage1.stdout.close()
        stage2.stdout.close()
        stage3.communicate()
    return fastq_name
Пример #23
0
 def __init__(self, target, query, out=False):
     """Store the output path and the lastz command line on the instance.

     ``self.output`` is ``out`` when that is truthy, otherwise the path of a
     new temp file with a ``.lastz`` suffix.  ``self.cli`` is the command
     string built from ``target``, ``query`` and that output path.
     """
     if out:
         self.output = out
     else:
         # no output file requested: let lastz write into a temp file
         descriptor, self.output = tempfile.mkstemp(suffix=".lastz")
         os.close(descriptor)
     # continuation-line indentation is inside the literal, so it is kept
     # exactly as it was
     self.cli = "{3} {0}[multiple,nameparse=full] {1}[nameparse=full]\
             --output={2} \
             --format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity".format(
         target, query, self.output, get_user_path("lastz", "lastz")
     )
Пример #24
0
def lastz_params(target, query, coverage, identity, outfile):
    """Return the lastz argument list for aligning ``query`` to ``target``.

    The list starts with the lastz executable path, followed by the two
    sequence specifiers, the fixed search options, and finally the
    coverage, identity, output and format options built from the arguments.
    """
    output_format = "general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity,coverage"
    executable = get_user_path("lastz", "lastz")
    sequences = [
        "{0}[multiple]".format(target),
        "{0}[nameparse=full]".format(query),
    ]
    fixed_options = [
        "--strand=both",
        "--seed=12of19",
        "--transition",
        "--nogfextend",
        "--nochain",
        "--gap=400,30",
        "--xdrop=910",
        "--ydrop=8370",
        "--hspthresh=3000",
        "--gappedthresh=3000",
        "--noentropy",
    ]
    caller_options = [
        "--coverage={0}".format(coverage),
        "--identity={0}".format(identity),
        "--output={0}".format(outfile),
        "--format={0}".format(output_format),
    ]
    return [executable] + sequences + fixed_options + caller_options
Пример #25
0
def create_reference_dict(log, sample, sample_dir, reference):
    """Run picard ``CreateSequenceDictionary`` for ``reference``.

    The dictionary is written next to the reference, with the reference's
    extension replaced by ``.dict``.  Combined stdout/stderr of picard goes
    to ``<sample_dir>/<sample>.picard-reference-dict-out.log``.  Nothing is
    returned.
    """
    log.info("Creating FASTA dict for {}".format(sample))
    stem, _ = os.path.splitext(reference)
    dict_path = stem + ".dict"
    picard_cmd = [get_user_path("binaries", "picard"), "CreateSequenceDictionary"]
    picard_cmd.append("R={}".format(reference))
    picard_cmd.append("O={}".format(dict_path))
    log_path = os.path.join(sample_dir, '{}.picard-reference-dict-out.log'.format(sample))
    with open(log_path, 'w') as log_fh:
        subprocess.Popen(picard_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
Пример #26
0
def se_align(log, sample, sample_dir, ref, cores, rS):
    """Align single-end reads with ``ngm`` and return the BAM path.

    ngm itself writes ``<sample_dir>/<sample>-se.bam`` (passed via ``-o``).
    Its combined stdout/stderr goes to
    ``<sample_dir>/<sample>.ngm.se.log``.
    """
    bam_path = os.path.join(sample_dir, '{}-se.bam'.format(sample))
    ngm_cmd = [
        get_user_path("ngm", "ngm"),
        "-r",
        ref,
        "-q",
        rS.pth,
        "-b",
        "-o",
        bam_path,
        "-t",
        str(cores),
        "--no-progress",
    ]
    log_path = os.path.join(sample_dir, '{}.ngm.se.log'.format(sample))
    log.info("Building BAM for {}".format(sample))
    with open(log_path, 'w') as log_fh:
        runner = subprocess.Popen(ngm_cmd, stdout=log_fh, stderr=subprocess.STDOUT)
        runner.communicate()
    return bam_path
Пример #27
0
def sort(log, sample, sample_dir, bam):
    """Sort ``bam`` with ``samtools sort <bam> -o <out>`` and return ``<out>``.

    The output name is ``bam`` with its extension replaced by
    ``.sorted.bam``.  samtools output is appended to
    ``<sample_dir>.samtools-sort-out.log`` (``sample_dir`` is a prefix here).
    """
    log.info("Sorting BAM for {}".format(sample))
    stem, _ = os.path.splitext(bam)
    sorted_bam = "{}.sorted.bam".format(stem)
    samtools = get_user_path("binaries", "samtools")
    sort_cmd = [samtools, "sort", bam, "-o", sorted_bam]
    log_name = '{}.samtools-sort-out.log'.format(sample_dir)
    # append mode: earlier sort logs with the same prefix are preserved
    with open(log_name, 'a') as log_fh:
        subprocess.Popen(sort_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    return sorted_bam
Пример #28
0
def sort(log, sample, sample_dir, bam):
    """Sort ``bam`` with ``samtools sort <bam> <prefix>``.

    The prefix is ``bam`` with its extension replaced by ``.sorted``; the
    returned name is that prefix plus ``.bam``.  samtools output is appended
    to ``<sample_dir>.samtools-sort-out.log`` (``sample_dir`` is a prefix
    here).
    """
    log.info("Sorting BAM for {}".format(sample))
    stem, _ = os.path.splitext(bam)
    prefix = "{}.sorted".format(stem)
    sort_cmd = [get_user_path("samtools", "samtools"), "sort", bam, prefix]
    log_name = '{}.samtools-sort-out.log'.format(sample_dir)
    # append mode: earlier sort logs with the same prefix are preserved
    with open(log_name, 'a') as log_fh:
        runner = subprocess.Popen(sort_cmd, stdout=log_fh, stderr=subprocess.STDOUT)
        runner.communicate()
    sorted_bam = "{}.bam".format(prefix)
    return sorted_bam
Пример #29
0
def clean_up_bam(log, sample, sample_dir, bam, type):
    """Run picard ``CleanSam`` on ``bam`` and delete the input afterwards.

    The output name comes from ``new_bam_name(bam, "CL")``.  Combined
    stdout/stderr of picard goes to
    ``<sample_dir>/<sample>.<type>.picard-clean-out.log``.

    Returns the new BAM name.
    """
    log.info("Cleaning BAM for {}".format(sample))
    cleaned_bam = new_bam_name(bam, "CL")
    picard_cmd = [get_user_path("binaries", "picard"), "CleanSam"]
    picard_cmd.append("I={}".format(bam))
    picard_cmd.append("O={}".format(cleaned_bam))
    log_path = os.path.join(sample_dir, '{}.{}.picard-clean-out.log'.format(sample, type))
    with open(log_path, 'w') as log_fh:
        subprocess.Popen(picard_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    # the input BAM is removed unconditionally once picard has returned
    os.remove(bam)
    return cleaned_bam
Пример #30
0
def fix_mate_information(log, sample, sample_dir, bam, type):
    """Run picard ``FixMateInformation`` on ``bam`` and delete the input.

    The output name comes from ``new_bam_name(bam, "CL")``.  Combined
    stdout/stderr of picard goes to
    ``<sample_dir>/<sample>.<type>.picard.fixmate.log``.

    Returns the new BAM name.
    """
    log.info("Fixing mate information for {}".format(sample))
    fixed_bam = new_bam_name(bam, "CL")
    picard_cmd = [get_user_path("binaries", "picard"), "FixMateInformation"]
    picard_cmd.append("I={}".format(bam))
    picard_cmd.append("O={}".format(fixed_bam))
    picard_cmd.append("VALIDATION_STRINGENCY=SILENT")
    log_path = os.path.join(sample_dir, '{}.{}.picard.fixmate.log'.format(sample, type))
    with open(log_path, 'w') as log_fh:
        subprocess.Popen(picard_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    # the input BAM is removed unconditionally once picard has returned
    os.remove(bam)
    return fixed_bam
Пример #31
0
def create_sai(log, sample, sample_dir, ref, cores, reads, read):
    """Run ``bwa aln`` on one read file and return the ``.sai`` path.

    bwa's stdout is written to ``<sample_dir>/<sample>-r<read>.sai`` and
    its stderr to ``<sample_dir>/<sample>-r<read>.bwa-aln-out.log``.
    ``reads`` must provide ``.file`` (used in the log message) and ``.pth``
    (given to bwa).
    """
    log.info("Creating read index file for {}".format(reads.file))
    aln_cmd = [get_user_path("bwa", "bwa"), "aln", "-t", str(cores), ref, reads.pth]
    sai_path = os.path.join(sample_dir, '{}-r{}.sai'.format(sample, read))
    err_path = os.path.join(sample_dir, '{}-r{}.bwa-aln-out.log'.format(sample, read))
    with open(sai_path, 'w') as sai_fh, open(err_path, 'w') as err_fh:
        runner = subprocess.Popen(aln_cmd, stdout=sai_fh, stderr=err_fh)
        runner.communicate()
    return sai_path
Пример #32
0
def coverage(log, sample, assembly_pth, assembly, cores, bam):
    """Run GATK ``DepthOfCoverage`` for ``sample`` from inside ``assembly_pth``.

    The process working directory is switched to ``assembly_pth`` while GATK
    runs.  It is always switched back afterwards, even when resolving the
    GATK path, opening the log file or launching the process raises.
    Combined stdout/stderr of GATK is written to
    ``<assembly_pth>/<sample>.GATK-coverage-out.log``.

    Returns ``<assembly_pth>/<sample>-coverage``, i.e. the ``-o`` value
    given to GATK joined onto ``assembly_pth``.
    """
    log.info("Computing coverage with GATK for {}".format(sample))
    cwd = os.getcwd()
    # move into reference directory
    os.chdir(assembly_pth)
    try:
        cmd = [
            get_user_path("binaries", "gatk"), "-T", "DepthOfCoverage", "-R",
            assembly, "-I", bam, "-o", "{}-coverage".format(sample), "-nt",
            str(cores), "--omitIntervalStatistics", "--omitLocusTable"
        ]
        gatk_coverage_fname = os.path.join(
            assembly_pth, '{}.GATK-coverage-out.log'.format(sample))
        with open(gatk_coverage_fname, 'w') as gatk_out:
            proc = subprocess.Popen(cmd, stdout=gatk_out, stderr=subprocess.STDOUT)
            proc.communicate()
    finally:
        # never leave the caller stranded in the reference directory
        os.chdir(cwd)
    return os.path.join(assembly_pth, "{}-coverage".format(sample))
Пример #33
0
 def run_alignment(self, clean=True):
     """Align ``self.input`` with MUSCLE and store it on ``self.alignment``.

     MUSCLE writes into a temp file with a ``.muscle`` suffix, which is then
     parsed as FASTA.  When ``clean`` is true the temp file is passed to
     ``self._clean`` afterwards.
     """
     # create results file
     fd, aln = tempfile.mkstemp(suffix='.muscle')
     os.close(fd)
     # run MUSCLE on the temp file
     cmd = [get_user_path("binaries", "muscle"), "-in", self.input, "-out", aln]
     proc = subprocess.Popen(cmd,
             stderr=subprocess.PIPE,
             stdout=subprocess.PIPE
         )
     # communicate() drains both pipes; the captured text is not used
     proc.communicate()
     # Plain 'r' replaces 'rU': the 'U' flag was removed in Python 3.11 and
     # universal newlines are already the default for text mode in Python 3.
     # The with-block closes the handle that used to be leaked.
     with open(aln, 'r') as handle:
         self.alignment = AlignIO.read(handle, \
                 "fasta", alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
     # cleanup temp files
     if clean:
         self._clean(aln)
Пример #34
0
def calculate_hs_metrics(log, sample, sample_dir, reference, bam, target, bait):
    """Run picard ``CollectHsMetrics`` on ``bam`` and return the metrics path.

    Metrics are written to ``<sample_dir>/<sample>.reads-on-target.txt``.
    Combined stdout/stderr of picard goes to
    ``<sample_dir>/<sample>.picard-hs-metrics-out.log``.
    """
    log.info("Calculating coverage metrics for {}".format(sample))
    metrics_path = os.path.join(sample_dir, "{}.reads-on-target.txt".format(sample))
    picard_cmd = [get_user_path("binaries", "picard"), "CollectHsMetrics"]
    picard_cmd += [
        "I={}".format(bam),
        "O={}".format(metrics_path),
        "REFERENCE_SEQUENCE={}".format(reference),
        "TARGET_INTERVALS={}".format(target),
        "BAIT_INTERVALS={}".format(bait),
        "VALIDATION_STRINGENCY=LENIENT",
    ]
    log_path = os.path.join(sample_dir, '{}.picard-hs-metrics-out.log'.format(sample))
    with open(log_path, 'w') as log_fh:
        subprocess.Popen(picard_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    return metrics_path
Пример #35
0
def phase(log, sample, sample_dir, bam):
    """Run ``samtools phase`` on ``bam`` and return the two BAM names.

    ``sample_dir`` is used as a prefix: it is passed to ``phase -b`` and
    also prefixes the log file name.  Combined stdout/stderr of samtools is
    written to that log.

    Returns ``<sample_dir>.0.bam`` and ``<sample_dir>.1.bam``.
    """
    log.info("Phasing BAM file for {}".format(sample))
    samtools = get_user_path("samtools", "samtools")
    phase_cmd = [samtools, "phase", "-A", "-F", "-Q", "20", "-b", sample_dir, bam]
    log_name = '{}.samtools-phase-out.log'.format(sample_dir)
    with open(log_name, 'w') as log_fh:
        subprocess.Popen(phase_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    first_bam = "{}.0.bam".format(sample_dir)
    second_bam = "{}.1.bam".format(sample_dir)
    return first_bam, second_bam
Пример #36
0
def fq_to_fa(log, sample, sample_dir, fastq, phase=None):
    """Convert ``fastq`` to FASTA with ``seqtk seq -a``.

    ``sample_dir`` is used as a filename prefix for both the seqtk stderr
    log and the FASTA output.  When ``phase`` is not ``None`` it is inserted
    into the FASTA name (``<sample_dir>.<phase>.fasta``).

    Returns the FASTA file name.
    """
    if phase is not None:
        log.info("Creating REF/ALT allele FASTA file {0} from FASTQ {0}".format(phase))
    else:
        log.info("Converting --Unphased-- FASTQ files to FASTA files")
    seqtk_cmd = [
        get_user_path("seqtk", "seqtk"),
        "seq",
        "-a",
        fastq,
    ]
    log_name = "{}.seqtk-seq-out.log".format(sample_dir)
    if phase is not None:
        fasta_name = "{}.{}.fasta".format(sample_dir, phase)
    else:
        fasta_name = "{}.fasta".format(sample_dir)
    with open(log_name, 'w') as err_fh, open(fasta_name, 'w') as fasta_fh:
        runner = subprocess.Popen(seqtk_cmd, stdout=fasta_fh, stderr=err_fh)
        runner.communicate()
    return fasta_name
Пример #37
0
 def run_alignment(self, clean=True):
     """Align ``self.input`` with MAFFT and store it on ``self.alignment``.

     MAFFT's stdout is captured in a temp file with a ``.mafft`` suffix,
     which is then parsed as FASTA.  When ``clean`` is true the temp file is
     passed to ``self._clean`` afterwards.
     """
     # create results file
     fd, aln = tempfile.mkstemp(suffix='.mafft')
     os.close(fd)
     # run MAFFT on the temp file
     cmd = [get_user_path("mafft", "mafft"), "--adjustdirection", "--maxiterate", "1000", self.input]
     # the with-block closes the output handle even if launching MAFFT raises
     with open(aln, 'w') as aln_stdout:
         proc = subprocess.Popen(cmd,
                 stderr=subprocess.PIPE,
                 stdout=aln_stdout
             )
         # communicate() drains stderr; the captured text is not used
         proc.communicate()
     # Plain 'r' replaces 'rU': the 'U' flag was removed in Python 3.11 and
     # universal newlines are already the default for text mode in Python 3.
     with open(aln, 'r') as handle:
         self.alignment = AlignIO.read(handle, "fasta", \
                 alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
     if clean:
         self._clean(aln)
Пример #38
0
 def run_alignment(self, clean=True):
     """Align ``self.input`` with MAFFT and store it on ``self.alignment``.

     MAFFT's stdout is captured in a temp file with a ``.mafft`` suffix,
     which is then parsed as FASTA.  When ``clean`` is true the temp file is
     passed to ``self._clean`` afterwards.
     """
     # create results file
     fd, aln = tempfile.mkstemp(suffix='.mafft')
     os.close(fd)
     # run MAFFT on the temp file
     cmd = [
         get_user_path("mafft", "mafft"), "--adjustdirection",
         "--maxiterate", "1000", self.input
     ]
     # the with-block closes the output handle even if launching MAFFT raises
     with open(aln, 'w') as aln_stdout:
         proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=aln_stdout)
         # communicate() drains stderr; the captured text is not used
         proc.communicate()
     # Plain 'r' replaces 'rU': the 'U' flag was removed in Python 3.11 and
     # universal newlines are already the default for text mode in Python 3.
     with open(aln, 'r') as handle:
         self.alignment = AlignIO.read(handle, "fasta", \
                 alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
     if clean:
         self._clean(aln)
Пример #39
0
def fq_to_fa(log, sample, sample_dir, fastq, phase=None):
    """Run ``seqtk seq -a`` on ``fastq`` and return the FASTA file name.

    Both the stderr log and the FASTA output are named with ``sample_dir``
    as a prefix; a non-``None`` ``phase`` is inserted into the FASTA name.
    """
    unphased = phase is None
    if unphased:
        log.info("Converting --Unphased-- FASTQ files to FASTA files")
    else:
        log.info("Creating REF/ALT allele FASTA file {0} from FASTQ {0}".format(phase))
    converter = [get_user_path("seqtk", "seqtk"), "seq", "-a", fastq]
    err_name = "{}.seqtk-seq-out.log".format(sample_dir)
    fasta_name = (
        "{}.fasta".format(sample_dir)
        if unphased
        else "{}.{}.fasta".format(sample_dir, phase)
    )
    with open(err_name, 'w') as err_fh, open(fasta_name, 'w') as fasta_fh:
        subprocess.Popen(converter, stdout=fasta_fh, stderr=err_fh).communicate()
    return fasta_name
Пример #40
0
def se_align(log, sample, sample_dir, ref, cores, rS):
    """Run ``ngm`` on the single-end reads and return the BAM path.

    The BAM (``<sample_dir>/<sample>-se.bam``) is written by ngm itself via
    ``-o``.  Combined stdout/stderr of ngm is captured in
    ``<sample_dir>/<sample>.ngm.se.log``.
    """
    se_bam = os.path.join(sample_dir, '{}-se.bam'.format(sample))
    aligner = [get_user_path("ngm", "ngm")]
    aligner += ["-r", ref]
    aligner += ["-q", rS.pth]
    aligner += ["-b", "-o", se_bam]
    aligner += ["-t", str(cores)]
    aligner += ["--no-progress"]
    ngm_log = os.path.join(sample_dir, '{}.ngm.se.log'.format(sample))
    log.info("Building BAM for {}".format(sample))
    with open(ngm_log, 'w') as sink:
        subprocess.Popen(aligner, stdout=sink, stderr=subprocess.STDOUT).communicate()
    return se_bam
Пример #41
0
def merge_two_bams(log, sample, sample_dir, bam, bam_se):
    """Merge ``bam`` and ``bam_se`` with picard ``MergeSamFiles``.

    The output name comes from ``new_bam_name(bam, "M")``.  Combined
    stdout/stderr of picard goes to
    ``<sample_dir>/<sample>.picard-merge-out.log``.  Both input BAMs are
    deleted afterwards.

    Returns the merged BAM name.
    """
    log.info("Merging BAMs for {}".format(sample))
    merged_bam = new_bam_name(bam, "M")
    picard_cmd = [get_user_path("binaries", "picard"), "MergeSamFiles", "SO=coordinate", "AS=true"]
    picard_cmd += ["I={}".format(bam), "I={}".format(bam_se)]
    picard_cmd += ["O={}".format(merged_bam), "VALIDATION_STRINGENCY=LENIENT"]
    log_path = os.path.join(sample_dir, '{}.picard-merge-out.log'.format(sample))
    with open(log_path, 'w') as log_fh:
        subprocess.Popen(picard_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    # both inputs are removed unconditionally once picard has returned
    for stale in (bam, bam_se):
        os.remove(stale)
    return merged_bam
Пример #42
0
def mark_duplicates(log, sample, sample_dir, bam, type):
    """Run picard ``MarkDuplicates`` on ``bam`` and delete the input.

    The output name comes from ``new_bam_name(bam, "MD")``.  The metrics
    file is ``<sample_dir>/<sample>.<type>.picard-metricsfile.txt`` and
    combined stdout/stderr of picard goes to
    ``<sample_dir>/<sample>.<type>.picard-MD-out.log``.  Duplicates are
    flagged, not removed (``REMOVE_DUPLICATES=false``).

    Returns the new BAM name.
    """
    log.info("Marking read duplicates from BAM for {}".format(sample))
    marked_bam = new_bam_name(bam, "MD")
    metrics_path = os.path.join(sample_dir, "{}.{}.picard-metricsfile.txt".format(sample, type))
    picard_cmd = [get_user_path("binaries", "picard"), "MarkDuplicates"]
    picard_cmd += [
        "I={}".format(bam),
        "O={}".format(marked_bam),
        "METRICS_FILE={}".format(metrics_path),
    ]
    picard_cmd += [
        "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=250",
        "ASSUME_SORTED=true",
        "VALIDATION_STRINGENCY=SILENT",
        "REMOVE_DUPLICATES=false",
    ]
    log_path = os.path.join(sample_dir, '{}.{}.picard-MD-out.log'.format(sample, type))
    with open(log_path, 'w') as log_fh:
        subprocess.Popen(picard_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    # the input BAM is removed unconditionally once picard has returned
    os.remove(bam)
    return marked_bam
Пример #43
0
def lastz_params(target, query, coverage, identity, outfile):
    """Build the lastz command as a list of arguments.

    The first element is the lastz executable path.  It is followed by the
    target and query specifiers, a fixed set of search options, and the
    coverage, identity, output and format options.
    """
    output_format = "general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity,coverage"
    args = [get_user_path("lastz", "lastz")]
    args.append("{0}[multiple]".format(target))
    args.append("{0}[nameparse=full]".format(query))
    # options that do not depend on the arguments
    args.extend([
        "--strand=both", "--seed=12of19", "--transition", "--nogfextend",
        "--nochain", "--gap=400,30", "--xdrop=910", "--ydrop=8370",
        "--hspthresh=3000", "--gappedthresh=3000", "--noentropy",
    ])
    args.append("--coverage={0}".format(coverage))
    args.append("--identity={0}".format(identity))
    args.append("--output={0}".format(outfile))
    args.append("--format={0}".format(output_format))
    return args
Пример #44
0
def add_rg_header_info(log, sample, sample_dir, flowcell, bam, type):
    """Run picard ``AddOrReplaceReadGroups`` on ``bam`` and delete the input.

    The output name comes from ``new_bam_name(bam, "RG")``.  ``flowcell``
    is used for ``RGPU`` and ``sample`` for both ``RGID`` and ``RGSM``.
    Combined stdout/stderr of picard goes to
    ``<sample_dir>/<sample>.<type>.picard-RG-out.log``.

    Returns the new BAM name.
    """
    log.info("Adding RG header to BAM for {}".format(sample))
    tagged_bam = new_bam_name(bam, "RG")
    picard_cmd = [get_user_path("binaries", "picard"), "AddOrReplaceReadGroups"]
    picard_cmd += ["I={}".format(bam), "O={}".format(tagged_bam)]
    picard_cmd += ["SORT_ORDER=coordinate", "RGPL=illumina"]
    picard_cmd += ["RGPU={}".format(flowcell), "RGLB=Lib1"]
    picard_cmd += ["RGID={}".format(sample), "RGSM={}".format(sample)]
    picard_cmd += ["VALIDATION_STRINGENCY=LENIENT"]
    log_path = os.path.join(sample_dir, '{}.{}.picard-RG-out.log'.format(sample, type))
    with open(log_path, 'w') as log_fh:
        subprocess.Popen(picard_cmd, stdout=log_fh, stderr=subprocess.STDOUT).communicate()
    # the input BAM is removed unconditionally once picard has returned
    os.remove(bam)
    return tagged_bam
Пример #45
0
This code is distributed under a 3-clause BSD license. Please see
LICENSE.txt for more information.

Created on 26 June 2014 17:13 PDT (-0700)
"""


import os
import subprocess
from phyluce.pth import get_user_path, get_user_param


# Settings looked up once at import time under the "java" key.
# NOTE(review): presumably the java executable, its memory flag and the
# directory holding the jar files -- confirm against phyluce.pth.
JAVA = get_user_param("java", "executable")
JAVA_PARAMS = get_user_param("java", "mem")
JAR_PATH = get_user_path("java", "jar")


def new_bam_name(bam, append):
    """Return ``bam`` renamed to ``<stem>-<append>.bam`` in the same directory.

    Only the last extension of the file name is stripped before the suffix
    is added, so ``x.sorted.bam`` becomes ``x.sorted-<append>.bam``.
    """
    directory, filename = os.path.split(bam)
    stem, _ = os.path.splitext(filename)
    return os.path.join(directory, "{}-{}.bam".format(stem, append))


def create_reference_dict(log, sample, sample_dir, reference):
    log.info("Creating FASTA dict for {}".format(sample))
    outf = os.path.splitext(reference)[0] + ".dict"
    cmd = [
        JAVA,
Пример #46
0
import os
import re
import gzip
import glob
import numpy
import subprocess
from collections import OrderedDict

from phyluce.pth import get_user_param, get_user_path

from Bio import SeqIO

# Settings looked up once at import time under the "java" key.  coverage()
# below builds its command as: JAVA JAVA_PARAMS -jar <JAR_PATH>/<GATK> ...
JAVA = get_user_param("java", "executable")
JAVA_PARAMS = get_user_param("java", "mem")
JAR_PATH = get_user_path("java", "jar")
GATK = get_user_param("java", "gatk")


def coverage(log, sample, assembly_pth, assembly, cores, bam):
    log.info("Computing coverage with GATK for {}".format(sample))
    cwd = os.getcwd()
    # move into reference directory
    os.chdir(assembly_pth)
    cmd = [
        JAVA, JAVA_PARAMS, "-jar",
        os.path.join(JAR_PATH, GATK), "-T", "DepthOfCoverage", "-R", assembly,
        "-I", bam, "-o", "{}-coverage".format(sample), "-nt",
        str(cores), "--omitIntervalStatistics", "--omitLocusTable"
    ]
    gatk_coverage_fname = os.path.join(