query information. Makes use of ExAC, dbSNP, 1000 genomes, clinvar, cosmic and effects annotations. The general idea is to prioritize deleterious variants missing or present at a low frequency in the population, or secondarily identified in external databases like COSMIC and ClinVar. """ import collections import csv import re from bcbio import utils from bcbio.distributed.transaction import file_transaction from bcbio.pipeline import datadict as dd from bcbio.provenance import do from bcbio.variation import population, vcfutils geneimpacts = utils.LazyImport("geneimpacts") cyvcf2 = utils.LazyImport("cyvcf2") def handle_vcf_calls(vcf_file, data, orig_items): """Prioritize VCF calls based on external annotations supplied through GEMINI. """ if not _do_prioritize(orig_items): return vcf_file else: ann_vcf = population.run_vcfanno(vcf_file, data) if ann_vcf: priority_file = _prep_priority_filter_vcfanno(ann_vcf, data) return _apply_priority_filter(ann_vcf, priority_file, data) # No data available for filtering, return original file else:
from distutils.version import LooseVersion import os import numpy as np from bcbio import bam, broad, utils from bcbio.bam import is_paired from bcbio.log import logger from bcbio.distributed.transaction import file_transaction from bcbio.pipeline import config_utils from bcbio.pipeline.shared import subset_variant_regions from bcbio.pipeline import datadict as dd from bcbio.provenance import do from bcbio.variation import annotation, bamprep, bedutils, gatk, vcfutils, ploidy cyvcf2 = utils.LazyImport("cyvcf2") def _add_tumor_params(paired, items, gatk_type): """Add tumor/normal BAM input parameters to command line. """ params = [] if not paired: raise ValueError( "Specified MuTect2 calling but 'tumor' phenotype not present in batch\n" "https://bcbio-nextgen.readthedocs.org/en/latest/contents/" "pipelines.html#cancer-variant-calling\n" "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items])) if gatk_type == "gatk4": params += ["-I", paired.tumor_bam]
""" import contextlib import math import os import toolz as tz from bcbio import bam, broad, utils from bcbio.bam import ref from bcbio.distributed.transaction import file_transaction, tx_tmpdir from bcbio.log import logger from bcbio.pipeline import config_utils from bcbio.pipeline import datadict as dd from bcbio.provenance import do pysam = utils.LazyImport("pysam") @contextlib.contextmanager def tobam_cl(data, out_file, is_paired=False): """Prepare command line for producing de-duplicated sorted output. - If no deduplication, sort and prepare a BAM file. - If paired, then use samblaster and prepare discordant outputs. - If unpaired, use biobambam's bammarkduplicates """ do_dedup = _check_dedup(data) umi_consensus = dd.get_umi_consensus(data) with file_transaction(data, out_file) as tx_out_file: if not do_dedup: yield (sam_to_sortbam_cl(data, tx_out_file), tx_out_file)
import os import toolz as tz import numpy as np import pandas as pd import pybedtools from bcbio.log import logger from bcbio import utils from bcbio.pipeline import datadict as dd from bcbio.provenance import do from bcbio.structural import convert from bcbio.distributed.transaction import file_transaction, tx_tmpdir from bcbio.variation import bedutils, vcfutils, ploidy, validateplot mpl = utils.LazyImport("matplotlib") plt = utils.LazyImport("matplotlib.pyplot") sns = utils.LazyImport("seaborn") # -- VCF based validation def _evaluate_vcf(calls, truth_vcf, work_dir, data): out_file = os.path.join( work_dir, os.path.join("%s-sv-validate.csv" % dd.get_sample_name(data))) if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: with open(tx_out_file, "w") as out_handle: writer = csv.writer(out_handle) writer.writerow(
Handles data normalization and plotting, emphasizing comparisons on methodology differences. """ import collections import os from distutils.version import LooseVersion import numpy as np import pandas as pd from bcbio.log import logger from bcbio import utils from bcbio.variation import bamprep mpl = utils.LazyImport("matplotlib") plt = utils.LazyImport("matplotlib.pyplot") mpl_ticker = utils.LazyImport("matplotlib.ticker") sns = utils.LazyImport("seaborn") def classifyplot_from_plotfiles(plot_files, out_csv, outtype="png", title=None, size=None): """Create a plot from individual summary csv files with classification metrics. """ dfs = [pd.read_csv(x) for x in plot_files] samples = [] for df in dfs: for sample in df["sample"].unique(): if sample not in samples: samples.append(sample) df = pd.concat(dfs) df.to_csv(out_csv, index=False)
import os

from bcbio import utils
from bcbio.utils import file_exists, get_R_exports, safe_makedir
from bcbio.bam import ref
from bcbio.heterogeneity import chromhacks
import bcbio.pipeline.datadict as dd
from bcbio.pipeline import config_utils, shared
from bcbio.ngsalign.postalign import dedup_bam
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.variation import vardict
from bcbio import broad, bam
from bcbio.variation import gatk, vcfutils
from bcbio.rnaseq import gtf

pybedtools = utils.LazyImport("pybedtools")


def rnaseq_gatk_variant_calling(data):
    """Run GATK RNA-seq variant calling on a sample: dedup, split reads, call.

    Threads the sample dictionary through each step, storing the deduplicated
    BAM back into the data dictionary before read splitting and calling.
    """
    # Deduplicate the work BAM first; downstream GATK steps read the deduped BAM.
    deduped = dedup_bam(dd.get_work_bam(data), data)
    data = dd.set_deduped_bam(data, deduped)
    # Split N-containing CIGAR reads, then run the GATK RNA-seq caller.
    data = gatk_splitreads(data)
    data = gatk_rnaseq_calling(data)
    return data


def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string,
    hard clipping regions that end up in introns
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
"""Integration with the DNAnexus platform using the API. Looks up and fills in sample locations from inputs folders in a DNAnexus project. """ import os import toolz as tz from bcbio import utils from bcbiovm.shared import retriever as sret dxpy = utils.LazyImport("dxpy") # ## DNAnexus specific functionality KEY = "dx" CONFIG_KEY = "dnanexus" def _authenticate(): assert os.environ.get("DX_AUTH_TOKEN"), \ "Need to set DX_AUTH_TOKEN for file retrieval from DNAnexus" dxpy.set_security_context({ "auth_token_type": "bearer", "auth_token": os.environ["DX_AUTH_TOKEN"] }) def _is_remote(f): return f.startswith("%s:" % KEY)
from datetime import datetime import collections import functools import os import gzip import pytz import re import socket import pandas as pd import pickle from bcbio import utils from bcbio.graph.collectl import load_collectl mpl = utils.LazyImport("matplotlib") plt = utils.LazyImport("matplotlib.pyplot") pylab = utils.LazyImport("pylab") def _setup_matplotlib(): # plt.style.use('ggplot') mpl.use('Agg') pylab.rcParams['image.cmap'] = 'viridis' pylab.rcParams['figure.figsize'] = (35.0, 12.0) # pylab.rcParams['figure.figsize'] = (100, 100) pylab.rcParams['figure.dpi'] = 300 pylab.rcParams['font.size'] = 25 def get_bcbio_nodes(path):
""" import os import six import pandas as pd import pybedtools from bcbio import utils from bcbio.utils import rbind, file_exists from bcbio.provenance import do from bcbio.distributed.transaction import file_transaction import bcbio.pipeline.datadict as dd from collections import defaultdict from itertools import repeat mpl = utils.LazyImport("matplotlib") plt = utils.LazyImport("matplotlib.pyplot") pylab = utils.LazyImport("pylab") backend_pdf = utils.LazyImport("matplotlib.backends.backend_pdf") sns = utils.LazyImport("seaborn") def _calc_regional_coverage(in_bam, chrom, start, end, samplename, work_dir): """ given a BAM and a region, calculate the coverage for each base in that region. returns a pandas dataframe of the format: chrom position coverage name where the samplename column is the coverage at chrom:position """ region_bt = pybedtools.BedTool("%s\t%s\t%s\n" % (chrom, start, end), from_string=True).saveas()
""" from __future__ import print_function import os from bcbio.log import logger from bcbio import utils import bcbio.pipeline.datadict as dd from bcbio.pipeline import config_utils from bcbio.distributed.transaction import file_transaction from bcbio.rnaseq import kallisto, sailfish, gtf from bcbio.provenance import do from bcbio.utils import file_exists, safe_makedir from bcbio.bam import fasta h5py = utils.LazyImport("h5py") import numpy as np import pandas as pd def get_fragment_length(data): """ lifted from https://github.com/pmelsted/pizzly/scripts/pizzly_get_fragment_length.py """ h5 = kallisto.get_kallisto_h5(data) cutoff = 0.95 with h5py.File(h5) as f: x = np.asarray(f['aux']['fld'], dtype='float64') y = np.cumsum(x) / np.sum(x) fraglen = np.argmax(y > cutoff)