Exemplo n.º 1
0
def align(pair):
    """Align FASTQ reads with bowtie2 and return the path of the sorted BAM.

    ``pair`` is a sequence holding one FASTQ path (single-end) or two
    (paired-end).  Outputs are written to the ``align`` directory and are
    named after the first FASTQ file.  The four stages (bowtie2 alignment to
    SAM, SAM to BAM conversion, sort, index) are each skipped when
    ``file_exists`` reports that the stage's output is already present.

    Raises ``subprocess.CalledProcessError`` if any command exits non-zero.
    """
    import os
    import subprocess
    from bcbio.utils import append_stem, file_exists, replace_suffix, safe_makedir

    def _shell(template, **fields):
        # Fill in the command template and hand it to the shell.
        subprocess.check_call(template.format(**fields), shell=True)

    safe_makedir("align")
    genome = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/bowtie2/GRCh37"
    sam_name = os.path.basename(replace_suffix(pair[0], ".sam"))
    out_sam = os.path.join("align", sam_name)
    out_bam = replace_suffix(out_sam, ".bam")
    sorted_bam = append_stem(out_bam, "_sorted")
    # samtools sort receives the output name without its extension.
    sorted_prefix = os.path.splitext(sorted_bam)[0]
    # NOTE(review): samtools index normally writes <file>.bam.bai rather than
    # the <stem>.bai checked here, so the index stage may rerun -- confirm.
    out_index = replace_suffix(sorted_bam, ".bai")

    if not file_exists(out_sam):
        if len(pair) == 2:
            fq1, fq2 = pair
            _shell("bowtie2 -S {out_sam} {genome} -1 {fq1} -2 {fq2}",
                   out_sam=out_sam, genome=genome, fq1=fq1, fq2=fq2)
        else:
            _shell("bowtie2 -S {out_sam} {genome} {fq1}",
                   out_sam=out_sam, genome=genome, fq1=pair[0])
    if not file_exists(out_bam):
        _shell("samtools view -S {out_sam} -b -o {out_bam}",
               out_sam=out_sam, out_bam=out_bam)
    if not file_exists(sorted_bam):
        _shell("samtools sort {out_bam} {sorted_prefix}",
               out_bam=out_bam, sorted_prefix=sorted_prefix)
    if not file_exists(out_index):
        _shell("samtools index {sorted}", sorted=sorted_bam)
    return sorted_bam
Exemplo n.º 2
0
def align(pair):
    """Run bowtie2 on ``pair`` and produce a sorted BAM file.

    ``pair`` holds either one FASTQ path (single-end) or two (paired-end).
    Every output lives in the ``align`` directory and takes its name from
    the first FASTQ file.  A stage is only executed when ``file_exists``
    says its output is missing.  Returns the sorted BAM path; a failing
    command surfaces as ``subprocess.CalledProcessError``.
    """
    import os
    import subprocess
    from bcbio.utils import append_stem, file_exists, replace_suffix, safe_makedir

    safe_makedir("align")
    out_sam = os.path.join(
        "align", os.path.basename(replace_suffix(pair[0], ".sam")))
    out_bam = replace_suffix(out_sam, ".bam")
    sorted_bam = append_stem(out_bam, "_sorted")
    out_index = replace_suffix(sorted_bam, ".bai")

    # Values substituted into the command templates below.
    fields = {
        "genome": "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/bowtie2/GRCh37",
        "out_sam": out_sam,
        "out_bam": out_bam,
        "sorted": sorted_bam,
        # samtools sort is given the sorted name minus its extension
        "sorted_prefix": os.path.splitext(sorted_bam)[0],
    }

    if not file_exists(out_sam):
        if len(pair) == 2:
            fields["fq1"], fields["fq2"] = pair
            bowtie_cmd = "bowtie2 -S {out_sam} {genome} -1 {fq1} -2 {fq2}"
        else:
            fields["fq1"] = pair[0]
            bowtie_cmd = "bowtie2 -S {out_sam} {genome} {fq1}"
        subprocess.check_call(bowtie_cmd.format(**fields), shell=True)

    # Remaining stages: (output that marks the stage as done, command template)
    later_stages = [
        (out_bam, "samtools view -S {out_sam} -b -o {out_bam}"),
        (sorted_bam, "samtools sort {out_bam} {sorted_prefix}"),
        (out_index, "samtools index {sorted}"),
    ]
    for expected, template in later_stages:
        if not file_exists(expected):
            subprocess.check_call(template.format(**fields), shell=True)
    return sorted_bam
Exemplo n.º 3
0
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files):
    """Trim adapters from each FASTQ file with cutadapt.

    ``fastq_files`` and ``out_files`` are walked in lockstep; one cutadapt
    run is made per pair.  ``quality_format`` equal to ``"illumina"``
    selects quality base 64, anything else selects 33.  Every entry of
    ``adapters`` becomes an ``--adapter=`` option.

    If all of ``out_files`` already exist nothing is run.  When cutadapt
    exits non-zero the command line is logged and the process exits with
    status 1.  Returns ``out_files``.
    """
    quality_base = "64" if quality_format == "illumina" else "33"

    # --times=2 tries twice remove adapters which will allow things like:
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    base_cmd = ["cutadapt", "--times=2", "--quality-base=" + quality_base,
                "--quality-cutoff=20", "--format=fastq", "--minimum-length=0"]
    base_cmd += ["--adapter=" + adapter for adapter in adapters]

    if all(map(file_exists, out_files)):
        return out_files

    for fastq, trimmed in zip(fastq_files, out_files):
        # if you pass an output filename, cutadapt will write some stats
        # about trimmed adapters to stdout. The stats file captures that.
        stats_path = replace_suffix(trimmed, ".trim_stats.txt")
        with open(stats_path, "w") as stats_out:
            cmd = base_cmd + ["--output=" + trimmed, fastq]
            try:
                subprocess.check_call(cmd, stdout=stats_out)
            except subprocess.CalledProcessError:
                logger.error("Cutadapt returned an error. The command "
                             "used to run cutadapt was: %s."
                             % subprocess.list2cmdline(cmd))
                exit(1)
    return out_files
Exemplo n.º 4
0
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files):
    """Run cutadapt once per input file to remove the given adapters.

    Inputs and outputs are paired positionally.  The quality base passed to
    cutadapt is 64 when ``quality_format`` is ``"illumina"`` and 33
    otherwise.  The run is skipped entirely when every path in
    ``out_files`` already exists.  A non-zero exit from cutadapt is logged
    together with the command line, after which the process exits with
    status 1.  Returns ``out_files``.
    """
    if quality_format == "illumina":
        quality_flag = "--quality-base=64"
    else:
        quality_flag = "--quality-base=33"

    # --times=2 tries twice remove adapters which will allow things like:
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    base_cmd = ["cutadapt", "--times=2", quality_flag, "--quality-cutoff=20",
                "--format=fastq", "--minimum-length=0"]
    for adapter in adapters:
        base_cmd.append("--adapter=" + adapter)

    if all(map(file_exists, out_files)):
        return out_files

    for source, destination in zip(fastq_files, out_files):
        # cutadapt prints trimming statistics on stdout when an output file
        # name is given; they are redirected into the stats file.
        stats_name = replace_suffix(destination, ".trim_stats.txt")
        with open(stats_name, "w") as stats_handle:
            run_cmd = list(base_cmd)
            run_cmd.append("--output=" + destination)
            run_cmd.append(source)
            try:
                subprocess.check_call(run_cmd, stdout=stats_handle)
            except subprocess.CalledProcessError:
                shown = subprocess.list2cmdline(run_cmd)
                logger.error("Cutadapt returned an error. The command "
                             "used to run cutadapt was: %s." % (shown))
                exit(1)
    return out_files
Exemplo n.º 5
0
def convert_bam_to_sam(in_file):
    """Convert a BAM file to SAM and return the SAM path.

    The output name is ``in_file`` with its suffix replaced by ``.sam``.
    If that file already exists it is returned without re-running the
    conversion.  Otherwise ``pysam.view`` is invoked with ``-h`` and writes
    to the temporary path supplied by ``file_transaction``.

    Raises:
        ValueError: if ``is_bam(in_file)`` is false.
    """
    if not is_bam(in_file):
        # The message names this function (it previously named the reverse
        # conversion, which sent readers to the wrong place).
        raise ValueError("Non BAM file passed to convert_bam_to_sam: "
                         "%s" % (in_file,))
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        pysam.view("-h", "-o" + tmp_out_file, in_file)
    return out_file
Exemplo n.º 6
0
def bam2sam(in_file):
    """
    Convert a BAM file to a SAM file and return the SAM path.
    bam2sam("file.bam") -> "file.sam"

    The conversion is skipped when the SAM file already exists.
    """
    sam_path = replace_suffix(in_file, ".sam")
    if not file_exists(sam_path):
        # pysam writes to the temporary path handed out by file_transaction
        with file_transaction(sam_path) as tx_sam:
            pysam.view("-h", "-o" + tx_sam, in_file)
    return sam_path
Exemplo n.º 7
0
def bam2sam(in_file):
    """
    Produce the SAM counterpart of a BAM file.
    bam2sam("file.bam") -> "file.sam"

    Returns the SAM path; an existing SAM file is reused as is.
    """
    target = replace_suffix(in_file, ".sam")
    already_converted = file_exists(target)
    if already_converted:
        return target
    with file_transaction(target) as scratch_path:
        output_option = "-o" + scratch_path
        pysam.view("-h", output_option, in_file)
    return target
Exemplo n.º 8
0
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Split a VCF file into separate files by chromosome.

    One line of the FASTA index returned by ``ref.fasta_idx`` yields one
    output file; the first two whitespace-separated fields of the line are
    taken as chromosome name and size.  Each output is named after
    ``in_file`` with ``-<chrom>`` appended to the stem and a ``.vcf``
    suffix, and is placed in ``out_dir`` (default: a ``split`` directory
    next to ``in_file``).  This function does not create ``out_dir``
    itself.  Returns the list of output paths in index order.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")
    split_files = []
    with open(ref.fasta_idx(ref_file, config)) as index_handle:
        for record in index_handle:
            chrom, size = record.split()[:2]
            stemmed = append_stem(in_file, "-%s" % chrom)
            vcf_name = os.path.basename(replace_suffix(stemmed, ".vcf"))
            chrom_vcf = os.path.join(out_dir, vcf_name)
            # size is passed through as the string read from the index
            subset_vcf(in_file, (chrom, 0, size), chrom_vcf, config)
            split_files.append(chrom_vcf)
    return split_files
Exemplo n.º 9
0
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Split a VCF file into separate files by chromosome.

    Chromosome names and sizes come from the first two fields of each line
    of the FASTA index located via ``ref.fasta_idx``.  For every chromosome
    ``subset_vcf`` is called with the region ``(chrom, 0, size)`` and an
    output path of the form ``<out_dir>/<in_file stem>-<chrom>.vcf``.
    ``out_dir`` defaults to ``split`` inside the directory of ``in_file``.
    Returns the output paths in the order they were written.
    """
    target_dir = out_dir
    if target_dir is None:
        target_dir = os.path.join(os.path.dirname(in_file), "split")

    def _chrom_path(chrom):
        # <target_dir>/<basename of in_file with "-chrom" stem and .vcf suffix>
        renamed = replace_suffix(append_stem(in_file, "-%s" % chrom), ".vcf")
        return os.path.join(target_dir, os.path.basename(renamed))

    written = []
    with open(ref.fasta_idx(ref_file, config)) as fai:
        for fai_line in fai:
            chrom, size = fai_line.split()[:2]
            destination = _chrom_path(chrom)
            subset_vcf(in_file, (chrom, 0, size), destination, config)
            written.append(destination)
    return written
Exemplo n.º 10
0
def bamindex(in_file, samtools="samtools"):
    """
    index a bam file
    avoids use of pysam.index which is not working for indexing as of 0.7.4
    with ipython

    Args:
        in_file: path of the BAM file to index.
        samtools: samtools executable to run (name on PATH or full path).

    Returns:
        The index path: ``in_file`` with its suffix replaced by ``.bai``.
        An existing index is reused.  If samtools fails, the error is
        logged and the path is still returned.

    Raises:
        AssertionError: if ``is_bam(in_file)`` is false.
    """
    assert is_bam(in_file), "bamindex requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".bai")
    if file_exists(out_file):
        return out_file
    # Honour the caller-supplied executable, and name the index explicitly:
    # by default samtools writes <in_file>.bai, which is not the path this
    # function checks for and returns.
    cmd = [samtools, "index", in_file, out_file]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("bamindex returned an error. The command "
                     "used to run bamindex was: %s." % (cmd_string))
    return out_file
Exemplo n.º 11
0
def bamindex(in_file, samtools="samtools"):
    """
    index a bam file
    avoids use of pysam.index which is not working for indexing as of 0.7.4
    with ipython

    Args:
        in_file: path of the BAM file to index.
        samtools: samtools executable to run (name on PATH or full path).

    Returns:
        The index path, i.e. ``in_file`` with a ``.bai`` suffix.  When the
        index already exists nothing is run.  A samtools failure is logged,
        not raised, and the path is returned regardless.

    Raises:
        AssertionError: if ``is_bam(in_file)`` is false.
    """
    assert is_bam(in_file), "bamindex requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".bai")
    if file_exists(out_file):
        return out_file
    # Use the executable passed by the caller instead of a fixed name, and
    # pass the index path so samtools writes the file that is returned here
    # (its default output name is <in_file>.bai).
    cmd = [samtools, "index", in_file, out_file]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("bamindex returned an error. The command "
                     "used to run bamindex was: %s." % (cmd_string))
    return out_file
Exemplo n.º 12
0
def bam2sam(in_file, samtools="samtools"):
    """
    converts a bam file to a sam file
    bam2sam("file.bam") -> "file.sam"

    Args:
        in_file: path of the BAM file to convert.
        samtools: samtools executable to run (name on PATH or full path).

    Returns:
        The SAM path (``in_file`` with a ``.sam`` suffix).  An existing SAM
        file is reused.  If samtools exits non-zero the error is logged and
        the path is still returned.

    Raises:
        AssertionError: if ``is_bam(in_file)`` is false.
    """
    assert is_bam(in_file), "bam2sam requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        # Build an argument list: a single command string passed to
        # check_call without shell=True is treated as the program name and
        # cannot run, and list2cmdline expects a sequence of arguments.
        cmd = [samtools, "view", "-h", "-o", tmp_out_file, in_file]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:
            cmd_string = subprocess.list2cmdline(cmd)
            logger.error("bam2sam returned an error. The command "
                         "used to run bam2sam was: %s." % (cmd_string))
    return out_file
Exemplo n.º 13
0
def bam2sam(in_file, samtools="samtools"):
    """
    converts a bam file to a sam file
    bam2sam("file.bam") -> "file.sam"

    Args:
        in_file: path of the BAM file to convert.
        samtools: samtools executable to run (name on PATH or full path).

    Returns:
        ``in_file`` with its suffix replaced by ``.sam``.  Nothing is run
        when that file already exists.  A non-zero samtools exit is logged
        rather than raised.

    Raises:
        AssertionError: if ``is_bam(in_file)`` is false.
    """
    assert is_bam(in_file), "bam2sam requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        # The command must be an argument list.  Passing one formatted
        # string to check_call (no shell=True) makes the OS look for a
        # program with that whole string as its name, and list2cmdline
        # would split the string into single characters in the log.
        cmd = [samtools, "view", "-h", "-o", tmp_out_file, in_file]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:
            cmd_string = subprocess.list2cmdline(cmd)
            logger.error("bam2sam returned an error. The command "
                         "used to run bam2sam was: %s." % (cmd_string))
    return out_file
Exemplo n.º 14
0
def mark_duplicates(sam_file):
    """Run Picard FixMateInformation, then MarkDuplicates, on ``sam_file``.

    The mate-fixed file is ``sam_file`` with ``_matefixed`` appended to its
    stem; the final output adds ``_dupemarked`` to that, and the metrics go
    to a ``_stats.txt`` file derived from the mate-fixed name.  A step is
    skipped when ``file_exists`` reports its output is already there.

    Returns the duplicate-marked file path.  Raises
    ``subprocess.CalledProcessError`` when a java command exits non-zero.
    """
    import subprocess
    from bcbio.utils import append_stem, file_exists, replace_suffix

    fm = "/n/HSPH/local/share/java/picard/FixMateInformation.jar"
    md = "/n/HSPH/local/share/java/picard/MarkDuplicates.jar"
    jvm_opts = "-Xms750m -Xmx2000m"

    mate_fixed_file = append_stem(sam_file, "_matefixed")
    if not file_exists(mate_fixed_file):
        fix_cmd = ("java {jvm_opts} -jar {fm} INPUT={sam_file} "
                   "OUTPUT={mate_fixed_file}").format(
                       jvm_opts=jvm_opts, fm=fm, sam_file=sam_file,
                       mate_fixed_file=mate_fixed_file)
        subprocess.check_call(fix_cmd, shell=True)

    # Duplicate marking works on the mate-fixed file from here on.
    out_file = append_stem(mate_fixed_file, "_dupemarked")
    stats_file = replace_suffix(append_stem(mate_fixed_file, "_stats"), ".txt")
    if not file_exists(out_file):
        dup_cmd = ("java {jvm_opts} -jar {md} INPUT={sam_file} "
                   "OUTPUT={out_file} METRICS_FILE={stats_file} "
                   "VALIDATION_STRINGENCY=LENIENT").format(
                       jvm_opts=jvm_opts, md=md, sam_file=mate_fixed_file,
                       out_file=out_file, stats_file=stats_file)
        subprocess.check_call(dup_cmd, shell=True)
    return out_file
Exemplo n.º 15
0
def mark_duplicates(sam_file):
    """Fix mate information and mark duplicates using the Picard jars.

    Step 1 writes ``<stem>_matefixed`` from ``sam_file``; step 2 reads that
    file and writes ``<stem>_matefixed_dupemarked`` plus a metrics file
    ending in ``_stats.txt``.  Either step is skipped if its output already
    exists according to ``file_exists``.  Returns the path written by
    step 2.  A failing command raises ``subprocess.CalledProcessError``.
    """
    import subprocess
    from bcbio.utils import append_stem, file_exists, replace_suffix

    def _java(jar, arguments):
        # Every Picard call shares the same JVM options and runs via the shell.
        command = "java {jvm_opts} -jar {jar} {arguments}".format(
            jvm_opts="-Xms750m -Xmx2000m", jar=jar, arguments=arguments)
        subprocess.check_call(command, shell=True)

    mate_fixed_file = append_stem(sam_file, "_matefixed")
    if not file_exists(mate_fixed_file):
        _java("/n/HSPH/local/share/java/picard/FixMateInformation.jar",
              "INPUT={0} OUTPUT={1}".format(sam_file, mate_fixed_file))

    out_file = append_stem(mate_fixed_file, "_dupemarked")
    stats_file = replace_suffix(append_stem(mate_fixed_file, "_stats"), ".txt")
    if not file_exists(out_file):
        _java("/n/HSPH/local/share/java/picard/MarkDuplicates.jar",
              "INPUT={0} OUTPUT={1} METRICS_FILE={2} "
              "VALIDATION_STRINGENCY=LENIENT".format(
                  mate_fixed_file, out_file, stats_file))
    return out_file
Exemplo n.º 16
0
def _run_cutadapt_on_single_file(base_cmd, fastq_file, out_file):
    """Run cutadapt on one FASTQ file through ``do.run``.

    The command is a copy of ``base_cmd`` followed by ``--output=<out_file>``
    and the input file.  A ``.trim_stats.txt`` file derived from
    ``out_file`` is created (or truncated) and kept open for the duration
    of the run.
    """
    stats_path = replace_suffix(out_file, ".trim_stats.txt")
    # NOTE(review): the stats file is opened but never handed to do.run, so
    # nothing visible here writes into it -- confirm whether cutadapt's
    # stdout was meant to be captured.
    with open(stats_path, "w"):
        full_cmd = list(base_cmd) + ["--output=" + out_file, fastq_file]
        do.run(full_cmd, "Running cutadapt on %s." % (fastq_file), None)
Exemplo n.º 17
0
 def test_replace_suffix_of_string(self):
     """replace_suffix swaps the extension of a single path string."""
     test_string = "/string/test/foo.txt"
     correct = "/string/test/foo.bar"
     out_string = utils.replace_suffix(test_string, ".bar")
     # assertEqual: the assertEquals alias is deprecated and was removed
     # from unittest in Python 3.12.
     self.assertEqual(correct, out_string)
Exemplo n.º 18
0
 def test_replace_suffix_of_list(self):
     """replace_suffix applied to a list swaps the extension of every path."""
     test_list = ["/list/test/foo.txt", "/list/test/foobar.txt"]
     correct = ["/list/test/foo.bar", "/list/test/foobar.bar"]
     out_list = utils.replace_suffix(test_list, ".bar")
     # Compare the whole sequence at once: a zip() loop stops at the shorter
     # input, so a truncated or empty result would pass unnoticed.  list()
     # accepts any iterable result.  assertEqual replaces the deprecated
     # assertEquals alias (removed in Python 3.12).
     self.assertEqual(correct, list(out_list))
Exemplo n.º 19
0
 def test_replace_suffix_of_list(self):
     """Every path in a list gets its extension replaced."""
     test_list = ["/list/test/foo.txt", "/list/test/foobar.txt"]
     correct = ["/list/test/foo.bar", "/list/test/foobar.bar"]
     out_list = utils.replace_suffix(test_list, ".bar")
     # Checking element by element over zip() silently ignores missing
     # items, so the complete result is compared instead.  assertEqual is
     # used because the assertEquals alias no longer exists in Python 3.12.
     self.assertEqual(correct, list(out_list))
Exemplo n.º 20
0
def _run_cutadapt_on_single_file(base_cmd, fastq_file, out_file):
    """Invoke cutadapt for a single input file via ``do.run``.

    ``base_cmd`` is copied, then ``--output=<out_file>`` and ``fastq_file``
    are appended.  The ``.trim_stats.txt`` companion of ``out_file`` is
    opened for writing while the command runs.
    """
    # NOTE(review): this handle is opened (creating/truncating the file) but
    # not passed on to do.run -- verify that the stats are captured elsewhere.
    with open(replace_suffix(out_file, ".trim_stats.txt"), "w"):
        trim_cmd = list(base_cmd)
        trim_cmd += ["--output=" + out_file, fastq_file]
        description = "Running cutadapt on %s." % (fastq_file)
        do.run(trim_cmd, description, None)
Exemplo n.º 21
0
 def chr_out(chrom):
     """Return the VCF path for ``chrom`` inside ``out_dir``.

     The file name is the basename of ``in_file`` with ``chrom`` appended to
     its stem and a ``.vcf`` suffix; ``in_file`` and ``out_dir`` come from
     the enclosing scope.
     """
     vcf_name = os.path.basename(
         replace_suffix(append_stem(in_file, chrom), ".vcf"))
     return os.path.join(out_dir, vcf_name)
Exemplo n.º 22
0
 def test_replace_suffix_of_string(self):
     """A single path string gets its extension replaced."""
     test_string = "/string/test/foo.txt"
     correct = "/string/test/foo.bar"
     out_string = utils.replace_suffix(test_string, ".bar")
     # The assertEquals alias is deprecated and gone as of Python 3.12;
     # assertEqual is the supported spelling.
     self.assertEqual(correct, out_string)
Exemplo n.º 23
0
def _run_cutadapt_on_single_file(base_cmd, fastq_file, out_file):
    """Run cutadapt on one FASTQ file, sending its stdout to a stats file.

    The command is a copy of ``base_cmd`` with ``--output=<out_file>`` and
    ``fastq_file`` appended.  Standard output is directed to the
    ``.trim_stats.txt`` file derived from ``out_file``; execution is
    delegated to ``_run_with_possible_error_message``.
    """
    stats_path = replace_suffix(out_file, ".trim_stats.txt")
    with open(stats_path, "w") as stats_out:
        full_cmd = list(base_cmd) + ["--output=" + out_file, fastq_file]
        _run_with_possible_error_message(full_cmd, stdout=stats_out)
Exemplo n.º 24
0
def _run_cutadapt_on_single_file(base_cmd, fastq_file, out_file):
    """Trim a single FASTQ file with cutadapt and keep its stdout.

    ``base_cmd`` is left untouched: a copy is extended with the output
    option and the input path.  Whatever the command prints on stdout is
    written to ``out_file``'s ``.trim_stats.txt`` companion.  The run
    itself is handled by ``_run_with_possible_error_message``.
    """
    with open(replace_suffix(out_file, ".trim_stats.txt"), "w") as sink:
        trim_cmd = list(base_cmd)
        trim_cmd.append("--output=" + out_file)
        trim_cmd.append(fastq_file)
        _run_with_possible_error_message(trim_cmd, stdout=sink)