示例#1
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
示例#2
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis:
       1. Extract the unmapped reads
       2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
       3. Report viruses that are in more than 50% covered by at least 5x
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [
            x for x in dd.get_viral_files(data)
            if os.path.basename(x) == "%s.fa" % viral_target
        ]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(
                utils.safe_makedir(out_dir), "%s-%s.bam" %
                (dd.get_sample_name(data),
                 utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-completeness.txt" % utils.splitext_plus(
                viral_bam)[0]
            cores = dd.get_num_cores(data)
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        tmpfile = "%s-tmp" % utils.splitext_plus(
                            tx_out_file)[0]
                        cmd = (
                            "samtools view -u -f 4 {bam_file} | "
                            "bamtofastq collate=0 | "
                            "bwa mem -t {cores} {viral_ref} - | "
                            "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                            "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}"
                        )
                        do.run(cmd.format(**locals()),
                               "Align unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    sample_name = dd.get_sample_name(data)
                    mosdepth_prefix = os.path.splitext(viral_bam)[0]
                    cmd = (
                        "mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                        "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                        "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
                        "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                        "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && "
                        "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
                        "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
                        "sort -n -r -k 5,5 >> {tx_out_file}")
                    do.run(cmd.format(**locals()),
                           "Analyse coverage of viral genomes")
            out["base"] = out_file
            out["secondary"] = []
    return out
示例#3
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis:
       1. Extract the unmapped reads
       2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
       3. Report viruses that are in more than 50% covered by at least 5x
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
            cores = dd.get_num_cores(data)
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Align unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    sample_name = dd.get_sample_name(data)
                    mosdepth_prefix = os.path.splitext(viral_bam)[0]
                    cmd = ("mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                           "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                           "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
                           "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                           "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && "
                           "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
                           "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
                           "sort -n -r -k 5,5 >> {tx_out_file}")
                    do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
            out["base"] = out_file
            out["secondary"] = []
    return out
示例#4
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [
            x for x in dd.get_viral_files(data)
            if os.path.basename(x) == "%s.fa" % viral_target
        ]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(
                utils.safe_makedir(out_dir), "%s-%s.bam" %
                (dd.get_sample_name(data),
                 utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(
                            tx_out_file)[0]
                        cmd = (
                            "samtools view -u -f 4 {bam_file} | "
                            "bamtofastq collate=0 | "
                            "bwa mem -t {cores} {viral_ref} - | "
                            "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                            "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}"
                        )
                        do.run(cmd.format(**locals()),
                               "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" %
                                         dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" %
                                                 (info.contig, info.aligned))
            out["base"] = out_file
    return out
示例#5
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis:
       1. Extract the unmapped reads
       2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
       3. Report viruses that are in more than 50% covered by at least 5x
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    viral_refs = [
        x for x in dd.get_viral_files(data)
        if os.path.basename(x) == "%s.fa" % viral_target
    ]
    if viral_refs and utils.file_exists(viral_refs[0]):
        viral_ref = viral_refs[0]
        viral_bam = os.path.join(
            utils.safe_makedir(out_dir), "%s-%s.bam" %
            (dd.get_sample_name(data),
             utils.splitext_plus(os.path.basename(viral_ref))[0]))
        out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
        cores = dd.get_num_cores(data)
        samtools = config_utils.get_program("samtools", data["config"])
        bamtofastq = config_utils.get_program("bamtofastq", data["config"])
        bamsort = config_utils.get_program("bamsort", data["config"])
        if not utils.file_uptodate(out_file, bam_file):
            if not utils.file_uptodate(viral_bam, bam_file):
                with file_transaction(data, viral_bam) as tx_out_file:
                    tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                    tmpbam = "%s-tmpbam" % utils.splitext_plus(tx_out_file)[0]
                    # the weirdest bug
                    # in bcbio1.2.9a ipython (only ipython not multicore) runs fail after this step with bgzf error, see issue 3581
                    # what helps is to samtools view the file to restore the proper EOF
                    cmd = (
                        f"{samtools} view -u -f 4 {bam_file} | "
                        f"{bamtofastq} collate=0 | "
                        f"bwa mem -t {cores} {viral_ref} - | "
                        f"{bamsort} tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                        f"inputformat=sam index=1 indexfilename={tmpbam}.bai O={tmpbam}.bam &&"
                        f"{samtools} view -bh {tmpbam}.bam > {tx_out_file} && "
                        f"{samtools} index {tx_out_file}")
                    do.run(cmd, "Align unmapped reads to viral genome")
            total_reads = _count_reads(bam_file, data)
            assert total_reads > 0, 'Reads count is {total_reads}, is there a bug in counting the read count? {bam_file}'.format(
                **locals())
            with file_transaction(data, out_file) as tx_out_file:
                sample_name = dd.get_sample_name(data)
                mosdepth_prefix = os.path.splitext(viral_bam)[0]
                mosdepth = config_utils.get_program("mosdepth", data)
                cmd = (
                    "{mosdepth} -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                    "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                    "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
                    "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                    "echo '#virus\tsize\tdepth\t1x\t5x\t25x\treads\treads_pct' >> {tx_out_file} && "
                    "paste "
                    "<(zcat {mosdepth_prefix}.regions.bed.gz) "
                    "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) "
                    "<(samtools idxstats {viral_bam} | grep -v '*') | "
                    "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | "
                    "sort -n -r -k 5,5 >> {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Analyse coverage of viral genomes")
                if chromhacks.get_EBV(data):
                    ref_file = dd.get_ref_file(data)
                    work_bam = dd.get_work_bam(data)
                    ebv = chromhacks.get_EBV(data)
                    mosdepth_prefix = os.path.splitext(work_bam)[0] + "-EBV"
                    mosdepth = config_utils.get_program("mosdepth", data)
                    cmd = (
                        "{mosdepth} -t {cores} {mosdepth_prefix} {work_bam} -n --thresholds 1,5,25 --by "
                        "<(grep {ebv} {ref_file}.fai | awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}') && "
                        "paste "
                        "<(zcat {mosdepth_prefix}.regions.bed.gz) "
                        "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) "
                        "<(samtools idxstats {work_bam} | grep {ebv}) | "
                        "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | "
                        "sort -n -r -k 5,5 >> {tx_out_file}")
                    do.run(cmd.format(**locals()), "Analyse coverage of EBV")

        out["base"] = out_file
        out["secondary"] = []
    return out