def command(self):
    """Return the shell command for paired VarDict calling plus dbSNP annotation.

    The command consists of three steps chained with ``&&``:

    1. ``vardict-java | testsomatic.R | var2vcf_paired.pl`` followed by
       clean-up, sorting and PASS filtering, written bgzip-compressed and
       tabix-indexed to a temporary VCF under ``self.scratch``.
       Non-somatic variants are not removed here; instead every
       ``Somatic;`` occurrence gets an extra ``SOMATIC`` INFO flag, and the
       matching ``##INFO`` header line is inserted before ``#CHROM``.
    2. ``bcftools annotate`` takes the ID column from ``self.dbsnp`` and
       writes ``self.output``, which is then tabix-indexed.
    3. The temporary VCF and everything matching ``<tmp_vcf>*`` is removed.
    """
    # Return values are discarded; presumably these calls only reject a
    # missing tumor/normal input up front (helper is defined elsewhere).
    required("", self.input_tumor)
    required("", self.input_normal)

    scratch_vcf = "{scratch}/{uuid}.vcf.gz".format(scratch=self.scratch, uuid=uuid.uuid4())

    # Step 1: call, convert to VCF, tag somatic records, sort and compress.
    calling_parts = [
        "vardict-java ",
        required("-G ", self.reference_sequence),
        optional("-f ", self.min_alt_frac),
        required("-N ", self.tumorid),
        optional("-r ", self.min_num_reads),
        " -b \"{}|{}\" ".format(self.input_tumor, self.input_normal),
        " -c 1 -S 2 -E 3 -g 4 -Q 10 ",
        required("", self.target_bed),
        " | testsomatic.R ",
        " | var2vcf_paired.pl -P 0.9 -m 4.25 ",
        required("-f ", self.min_alt_frac),
        " -N \"{}|{}\" ".format(self.tumorid, self.normalid),
        " | ",
        fix_ambiguous_cl(),
        " | ",
        remove_dup_cl(),
        " | sed 's/Somatic;/Somatic;SOMATIC;/g' ",
        " | sed '/^#CHROM/i ##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description=\"Somatic event\">' ",
        " | vcfstreamsort -w 1000 ",
        " | bcftools view --apply-filters .,PASS ",
        " | vcfsorter.pl {} /dev/stdin ".format(self.reference_dict),
        " | bgzip > ",
        scratch_vcf,
        " && tabix -p vcf ",
        scratch_vcf,
    ]
    calling_step = "".join(calling_parts)

    # Step 2: annotate variants with dbSNP ids and index the final output.
    annotation_parts = [
        "bcftools annotate --annotation {} --columns ID ".format(self.dbsnp),
        " --output-type z --output {} ".format(self.output),
        scratch_vcf,
        " && tabix -p vcf {}".format(self.output),
    ]
    annotation_step = "".join(annotation_parts)

    # Step 3: the glob also removes the tabix index next to the temporary VCF.
    cleanup_step = "".join(["rm ", scratch_vcf, "*"])

    return " && ".join([calling_step, annotation_step, cleanup_step])
def command(self):
    """Return the shell pipeline for paired VarDict somatic variant calling.

    Pipeline stages, in order: ``vardict-java`` -> ``testsomatic.R`` ->
    ``var2vcf_paired.pl`` -> depth/frequency filter -> somatic filter ->
    ambiguity and duplicate clean-up -> ``vcfstreamsort`` -> split and
    left-align -> keep unfiltered/PASS records -> ``vcfsorter.pl`` ->
    optional blacklist filter -> ``bgzip`` into ``self.output`` followed by
    ``tabix``.

    Fixes compared to the previous version of this method:

    * ``conditional`` was called with its arguments reversed relative to the
      ``conditional(test, value)`` order used by the other call sites in this
      file, so the blacklist bed path was appended as a stray argument
      instead of the blacklist stage.
    * The blacklist stage ended in a dangling ``|`` which, once followed by
      ``" | bgzip"``, gave an empty pipeline element (shell syntax error).
    * ``intersectBed -a .`` does not read the piped VCF; it now reads
      ``stdin``, keeps the VCF header (``-header``) and drops records that
      overlap the blacklist (``-v``).
    """
    # Return values are discarded; presumably these calls only reject a
    # missing tumor/normal input up front (helper is defined elsewhere).
    required("", self.input_tumor)
    required("", self.input_normal)

    freq_filter = (
        " bcftools filter -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
        "| %s -c 'from autoseq.util.bcbio import depth_freq_filter_input_stream; import sys; print depth_freq_filter_input_stream(sys.stdin, %s, \"%s\")' " %
        (sys.executable, 0, 'bwa'))

    somatic_filter = (
        " sed 's/\\.*Somatic\\\"/Somatic/' "  # changes \".*Somatic\" to Somatic
        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
        "| %s -c 'from autoseq.util.bcbio import call_somatic; import sys; print call_somatic(sys.stdin.read())' " % sys.executable)

    # NOTE(review): "-v" (drop records overlapping the bed) is inferred from
    # the attribute name "blacklist_bed"; confirm that exclusion is intended.
    # No trailing pipe here: the next stage supplies its own leading " | ".
    blacklist_filter = " | intersectBed -header -v -a stdin -b {} ".format(
        self.blacklist_bed)

    cmd = (
        "vardict-java " + required("-G ", self.reference_sequence) +
        optional("-f ", self.min_alt_frac) +
        required("-N ", self.tumorid) +
        optional("-r ", self.min_num_reads) +
        " -b \"{}|{}\" ".format(self.input_tumor, self.input_normal) +
        " -c 1 -S 2 -E 3 -g 4 -Q 10 " + required("", self.target_bed) +
        " | testsomatic.R " +
        " | var2vcf_paired.pl -P 0.9 -m 4.25 -M " + required("-f ", self.min_alt_frac) +
        " -N \"{}|{}\" ".format(self.tumorid, self.normalid) +
        " | " + freq_filter + " | " + somatic_filter +
        " | " + fix_ambiguous_cl() + " | " + remove_dup_cl() +
        " | vcfstreamsort -w 1000 " +
        " | " + vt_split_and_leftaln(self.reference_sequence) +
        " | bcftools view --apply-filters .,PASS " +
        " | vcfsorter.pl {} /dev/stdin ".format(self.reference_dict) +
        # Test first, value second: the stage is only added when a bed is set.
        conditional(self.blacklist_bed, blacklist_filter) +
        " | bgzip > {output} && tabix -p vcf {output}".format(output=self.output)
    )
    return cmd
def command(self):
    """Return the shell command that runs freebayes-parallel over the targets.

    Three steps are chained with ``&&``:

    1. ``self.target_bed`` is converted by ``bed_to_regions.py`` into a
       temporary regions file under ``self.scratch``.
    2. ``freebayes-parallel`` runs on ``self.input_bams``; its output is
       filtered, optionally passed through ``call_somatic`` (when
       ``self.somatic_only`` is set), split/left-aligned, de-duplicated,
       restricted to unfiltered/PASS records and written bgzip-compressed
       to ``self.output``, which is then tabix-indexed.
    3. The temporary regions file is removed.
    """
    regions_path = "{scratch}/{uuid}.regions".format(scratch=self.scratch, uuid=uuid.uuid4())

    make_regions_step = "cat {} | bed_to_regions.py > {}".format(
        self.target_bed, regions_path)

    # Optional stage, only spliced into the pipeline for somatic-only runs.
    somatic_stage = " | {} -c 'from autoseq.util.bcbio import call_somatic; import sys; print call_somatic(sys.stdin.read())' ".format(
        sys.executable)

    calling_parts = [
        "freebayes-parallel {} {} ".format(regions_path, self.threads),
        required("-f ", self.reference_sequence),
        " --use-mapping-quality ",
        optional("--min-alternate-fraction ", self.min_alt_frac),
        optional("--min-coverage ", self.min_coverage),
        conditional(self.use_harmonic_indel_quals, "--harmonic-indel-quality"),
        optional("", self.params),
        repeat(" ", self.input_bams),
        "| bcftools filter -i 'ALT=\"<*>\" || QUAL > 5' ",
        "| filter_erroneus_alt.py -V /dev/stdin ",
        conditional(self.somatic_only, somatic_stage),
        " | ",
        vt_split_and_leftaln(self.reference_sequence),
        # vcfuniq: freebayes sometimes reports duplicate variants that need
        # to be collapsed.
        " | vcfuniq | bcftools view --apply-filters .,PASS ",
        " | bgzip > {output} && tabix -p vcf {output}".format(output=self.output),
    ]
    calling_step = "".join(calling_parts)

    cleanup_step = "rm {}".format(regions_path)

    return " && ".join([make_regions_step, calling_step, cleanup_step])
def command(self):
    """Return the command line for picard ``CollectWgsMetrics``.

    Input BAM, reference and metrics output are passed through ``required``;
    the mapping-quality, base-quality and coverage-cap settings go through
    ``optional``.
    """
    pieces = [
        "picard -XX:ParallelGCThreads=1 CollectWgsMetrics ",
        required("I=", self.input),
        required("R=", self.reference_sequence),
        required("O=", self.output_metrics),
        optional("MINIMUM_MAPPING_QUALITY=", self.minimum_mapping_quality),
        optional("MINIMUM_BASE_QUALITY=", self.minimum_base_quality),
        optional("COVERAGE_CAP=", self.coverage_cap),
    ]
    return "".join(pieces)
def command(self):
    """Return the command line that runs ``multiqc`` on ``self.search_dir``.

    ``self.output`` is split into its directory (passed as ``-o``) and its
    file name (passed as ``-n``). Data format and report title are optional.
    """
    # Return values are discarded; presumably these calls only check that
    # the three attributes are set (helper is defined elsewhere).
    required("input_files", self.input_files)
    required("output_base", self.output)
    required("dir_to_search", self.search_dir)

    out_dir, report_name = os.path.split(self.output)

    pieces = [
        "multiqc ",
        required("", self.search_dir),
        required("-o ", out_dir),
        optional("-n ", report_name),
        optional("-k ", self.data_format),
        optional("-i ", self.report_title),
        " --data-dir --zip-data-dir -v -f",
    ]
    return "".join(pieces)
def command(self):
    """Return the command line for picard ``CollectGcBiasMetrics``.

    The chart is sent to ``/dev/null``; metrics and summary outputs, the
    input and the reference are required, ``STOP_AFTER`` is optional.
    """
    pieces = [
        "picard -XX:ParallelGCThreads=1 -Xmx5g CollectGcBiasMetrics CHART=/dev/null",
        required("I=", self.input),
        required("O=", self.output_metrics),
        required("S=", self.output_summary),
        required("R=", self.reference_sequence),
        optional("STOP_AFTER=", self.stop_after),
    ]
    return "".join(pieces)
def command(self):
    """Return the command that bgzip-compresses the input and tabix-indexes it.

    ``self.input`` is streamed through ``bgzip`` into ``self.output``; on
    success ``tabix`` is run on the output, with ``-p <filetype>`` when
    ``self.filetype`` is given.
    """
    compress_step = "".join([
        "cat ",
        required(" ", self.input),
        " | bgzip ",
        required(" > ", self.output),
    ])
    index_step = "".join([
        " && tabix ",
        optional("-p ", self.filetype),
        " {} ".format(self.output),
    ])
    return compress_step + index_step
def command(self):
    """Return the command line for picard ``CollectHsMetrics``.

    Input, reference, metrics output, target intervals and bait intervals
    are required; the bait set name is optional and
    ``METRIC_ACCUMULATION_LEVEL=`` is emitted via ``repeat`` for
    ``self.accumulation_level``.
    """
    pieces = [
        "picard -XX:ParallelGCThreads=1 CollectHsMetrics ",
        required("I=", self.input),
        required("R=", self.reference_sequence),
        required("O=", self.output_metrics),
        required("TI=", self.target_regions),
        required("BI=", self.bait_regions),
        optional("BAIT_SET_NAME=", self.bait_name),
        repeat('METRIC_ACCUMULATION_LEVEL=', self.accumulation_level),
    ]
    return "".join(pieces)
def command(self):
    """Return the command line for the GATK ``HeterozygoteConcordance`` walker.

    Reference, input VCF, input BAM, normal sample id and output are
    required; target regions are optional and
    ``--filter_reads_with_N_cigar`` is added when the corresponding
    attribute is set.
    """
    pieces = [
        "gatk-klevebring -T HeterozygoteConcordance ",
        required("-R ", self.reference_sequence),
        required("-V ", self.input_vcf),
        required("-I ", self.input_bam),
        required("-sid ", self.normalid),
        optional("-L ", self.target_regions),
        conditional(self.filter_reads_with_N_cigar, "--filter_reads_with_N_cigar"),
        required("-o ", self.output),
    ]
    return "".join(pieces)
def command(self):
    """Return the shell command that runs ``cnvkit.py batch`` on one BAM.

    Exactly one of ``self.reference`` and ``self.targets_bed`` must be set.
    cnvkit writes into a fresh temporary directory under ``self.scratch``;
    the resulting ``.cns`` and ``.cnr`` files are then copied to
    ``self.output_cns`` / ``self.output_cnr`` and the directory is removed.

    Raises:
        ValueError: if neither or both of ``reference`` and ``targets_bed``
            are supplied.
    """
    if not (self.reference or self.targets_bed):
        raise ValueError("Either reference or targets_bed must be supplied")
    if self.reference and self.targets_bed:
        raise ValueError("Supply either reference OR targets_bed")

    work_dir = "{}/cnvkit-{}".format(self.scratch, uuid.uuid4())
    # cnvkit output files are looked up by the BAM file name without ".bam".
    bam_stem = stripsuffix(os.path.basename(self.input_bam), ".bam")

    batch_step = "".join([
        "cnvkit.py batch ",
        required("", self.input_bam),
        optional("-r ", self.reference),
        # The next two options are only added when a targets bed is given.
        conditional(self.targets_bed, "--fasta " + str(self.fasta) + " --split "),
        conditional(self.targets_bed, "-n"),
        optional("-t ", self.targets_bed),
        required("-d ", work_dir),
    ])

    copy_cns_step = "cp {}/{}.cns ".format(work_dir, bam_stem) + required(" ", self.output_cns)
    copy_cnr_step = "cp {}/{}.cnr ".format(work_dir, bam_stem) + required(" ", self.output_cnr)
    cleanup_step = "rm -r {}".format(work_dir)

    return " && ".join([batch_step, copy_cns_step, copy_cnr_step, cleanup_step])
def command(self):
    """Return the shell command that runs ``PureCN.R`` in its conda env.

    The ``purecn-env`` environment is activated, ``PureCN.R`` is run with
    the configured inputs, and the environment is deactivated again; the
    three steps are chained with ``&&``.
    """
    purecn_parts = [
        "PureCN.R ",
        required("--out ", self.outdir),
        required("--sampleid ", self.tumorid),
        required("--segfile ", self.input_seg),
        required("--vcf ", self.input_vcf),
        required("--gcgene ", self.gcgene_file),
        required("--genome ", self.genome),
        optional("--minpurity ", self.minpurity),
        optional("--hzdev ", self.hzdev),
        optional("--segfilesdev ", self.seg_sdev),
        conditional(self.postopt, "--postoptimize"),
    ]

    steps = [
        "source activate purecn-env",  # enter the conda environment
        "".join(purecn_parts),         # run PureCN
        "source deactivate",           # leave the conda environment
    ]
    return " && ".join(steps)
def command(self):
    """Return the shell command that runs ``PureCN.R`` and touches its outputs.

    Four steps are chained with ``&&``: activate the ``purecn-env`` conda
    environment, run ``PureCN.R``, deactivate the environment, and finally
    ``touch`` the four required output files (``out_csv``, ``out_genes``,
    ``out_variants`` and ``output``).
    """
    purecn_parts = [
        "PureCN.R ",
        required("--out ", self.outdir),
        required("--sampleid ", self.tumorid),
        required("--segfile ", self.input_seg),
        required("--tumor ", self.input_cnr),
        required("--vcf ", self.input_vcf),
        required("--genome ", self.genome),
        optional("--funsegmentation ", self.funseg),
        optional("--minpurity ", self.minpurity),
        optional("--hzdev ", self.hzdev),
        optional("--maxnonclonal ", self.maxnonclonal),
        optional("--minaf ", self.minaf),
        optional("--error ", self.error),
        conditional(self.postopt, "--postoptimize"),
    ]

    steps = [
        "source activate purecn-env",  # enter the conda environment
        "".join(purecn_parts),         # run PureCN
        "conda deactivate",            # leave the conda environment
        # touch the required output files
        "touch {} {} {} {}".format(
            self.out_csv, self.out_genes, self.out_variants, self.output),
    ]
    return " && ".join(steps)
def command(self):
    """Return the shell command that runs ``qdnaseq.R`` in its conda env.

    The ``qdnaseqenv`` environment is activated, ``qdnaseq.R`` is run on
    ``self.input`` writing ``self.output`` (with an optional background),
    and the environment is deactivated. The returned string ends with a
    single trailing space.
    """
    run_step = "".join([
        "qdnaseq.R ",
        required("--bam ", self.input),
        required("--output ", self.output),
        optional("--background ", self.background),
    ])
    steps = ["source activate qdnaseqenv", run_step, "source deactivate"]
    return " && ".join(steps) + " "
def command(self):
    """Return the command line for ``target_coverage_histogram.py``.

    The targets bed and the input BAM are required, a minimum base quality
    is optional, and standard output is redirected to ``self.output``.
    """
    pieces = [
        "target_coverage_histogram.py ",
        required("--targets ", self.input_bed),
        required(" ", self.input_bam),
        optional("--min_basequal ", self.min_basequal),
        required("> ", self.output),
    ]
    return "".join(pieces)