def rare_variants_famseq(self, selected_vcf, vcf_out):
    """Call rare variants with pedigree information using FamSeq"""
    # e.g. FamSeq vcf -vcfFile ../TestData/test.vcf -pedFile ../TestData/fam01.ped -output test.FamSeq.vcf -v
    command = "FamSeq vcf -vcfFile {selected_vcf} -pedFile {ped_file} -output {vcf_out}".format(
        selected_vcf=selected_vcf,
        ped_file=self.famseq_ped_file,
        vcf_out=vcf_out)
    run_stage(self.state, "rare_variants_famseq", command)

def concatenate_vcfs(self, vcf_files_in, vcf_out):
    '''Merge per-sample VCF files with bcftools, batching in groups of 200 when there are many inputs'''
    if len(vcf_files_in) > 200:
        merge_commands = []
        temp_merge_outputs = []
        # str.rstrip('.vcf') strips characters, not the suffix; remove the
        # extension explicitly instead
        vcf_out_base = vcf_out[:-len('.vcf')] if vcf_out.endswith('.vcf') else vcf_out
        for n in range(0, int(math.ceil(float(len(vcf_files_in)) / 200.0))):
            start = n * 200
            filelist = vcf_files_in[start:start + 200]
            filelist_command = ' '.join([vcf for vcf in filelist])
            temp_merge_filename = vcf_out_base + ".temp_{start}.vcf".format(start=str(start))
            command1 = 'bcftools merge -O z -o {vcf_out} {join_vcf_files} && bcftools index -t -f {vcf_out}; '.format(
                vcf_out=temp_merge_filename,
                join_vcf_files=filelist_command)
            merge_commands.append(command1)
            temp_merge_outputs.append(temp_merge_filename)
        final_merge_vcfs = ' '.join([vcf for vcf in temp_merge_outputs])
        command2 = 'bcftools merge -O z -o {vcf_out} {join_vcf_files} '.format(
            vcf_out=vcf_out,
            join_vcf_files=final_merge_vcfs)
        merge_commands.append(command2)
        final_command = ''.join(merge_commands)
    else:
        filelist = ' '.join([vcf for vcf in vcf_files_in])
        final_command = 'bcftools merge -O z -o {vcf_out} {vcf_files} '.format(
            vcf_out=vcf_out,
            vcf_files=filelist)
    run_stage(self.state, 'concatenate_vcfs', final_command)

def clip_bam(self, bam_in, sorted_bam_out):
    '''Clip the BAM file using Bamclipper'''
    bamclipper_args = '{bamclipper} -b {bam_in} -p {primer_bedpe_file} -n 1'.format(
        bamclipper=self.bamclipper,
        bam_in=bam_in,
        primer_bedpe_file=self.primer_bedpe_file)
    run_stage(self.state, 'clip_bam', bamclipper_args)

def extract_split_read_alignments(self, bam_in, splitters_bam_out):
    '''Extract the split-read alignments using samtools'''
    command = ('samtools view -h {input_bam} | '
               'extractSplitReads_BwaMem -i stdin | '
               'samtools view -Sb - > {output_bam}'
               .format(input_bam=bam_in, output_bam=splitters_bam_out))
    run_stage(self.state, 'extract_split_read_alignments', command)

def apply_undr_rover(self, inputs, vcf_output, sample_id):
    '''Apply undr_rover to call variants from paired end fastq files'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('apply_undr_rover', 'cores')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')
    coverdir = "variants/undr_rover/coverdir"
    coverfile = sample_id + ".coverage"
    command = 'undr_rover --primer_coords {coord_file} ' \
              '--primer_sequences {primer_file} ' \
              '--reference {reference} ' \
              '--out {vcf_output} ' \
              '--coverfile {coverdir}/{coverfile} ' \
              '--proportionthresh {proportionthresh} ' \
              '--absthresh {absthresh} ' \
              '--max_variants {maxvariants} ' \
              '{fastq_read1} {fastq_read2}'.format(
                  coord_file=self.coord_file,
                  primer_file=self.primer_file,
                  reference=self.reference,
                  vcf_output=vcf_output,
                  coverdir=coverdir,
                  proportionthresh=self.proportionthresh,
                  absthresh=self.absthresh,
                  maxvariants=self.maxvariants,
                  coverfile=coverfile,
                  fastq_read1=fastq_read1_in,
                  fastq_read2=fastq_read2_in)
    run_stage(self.state, 'apply_undr_rover', command)

def apply_cat_vcf(self, inputs, vcf_out):
    '''Concatenate and sort undr_rover VCF files for downstream analysis'''
    vcfs = ' '.join([vcf for vcf in inputs])
    # safe_make_dir('variants')
    command = 'vcf-concat {vcfs} | vcf-sort -c | bgzip -c > {vcf_out} '.format(
        vcfs=vcfs, vcf_out=vcf_out)
    run_stage(self.state, 'apply_cat_vcf', command)

def vt_decompose_normalise(self, vcf_in, vcf_out):
    '''Decompose multiallelic sites and normalise representations'''
    command = "vt decompose -s {vcf_in} | vt normalize -r {reference} -o " \
              "{vcf_out} -".format(reference=self.reference,
                                   vcf_in=vcf_in,
                                   vcf_out=vcf_out)
    run_stage(self.state, 'vt_decompose_normalise', command)

def apply_undr_rover(self, input, vcf_output, sample_id):
    '''Apply undr_rover to call variants from paired end fastq files'''
    fastq_read1_in = 'fastqs/' + input[11:-20] + '_R1_001.fastq.gz'
    fastq_read2_in = 'fastqs/' + input[11:-20] + '_R2_001.fastq.gz'
    coverdir = "variants/undr_rover/coverdir"
    coverfile = sample_id + ".coverage"
    if "QC" in fastq_read1_in:
        primer_file = self.primer_file_QC
        interval_file = self.interval_file_QC
    else:
        primer_file = self.primer_file_default
        interval_file = self.interval_file
    command = 'undr_rover --primer_coords {coord_file} ' \
              '--primer_sequences {primer_file} ' \
              '--reference {reference} ' \
              '--out {vcf_output} ' \
              '--coverfile {coverdir}/{coverfile} ' \
              '--proportionthresh {proportionthresh} ' \
              '--absthresh {absthresh} ' \
              '--max_variants {maxvariants} ' \
              '--fast --snvthresh 10 ' \
              '{fastq_read1} {fastq_read2}'.format(
                  coord_file=interval_file,
                  primer_file=primer_file,
                  reference=self.reference,
                  vcf_output=vcf_output,
                  coverdir=coverdir,
                  proportionthresh=self.proportionthresh,
                  absthresh=self.absthresh,
                  maxvariants=self.maxvariants,
                  coverfile=coverfile,
                  fastq_read1=fastq_read1_in,
                  fastq_read2=fastq_read2_in)
    run_stage(self.state, 'apply_undr_rover', command)

def align_bwa(self, inputs, bam_out, sample_id, lib):
    '''Align the paired end fastq files to the reference genome using bwa'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('align_bwa', 'cores')
    read_group = '"@RG\\tID:{sample}\\tSM:{sample}\\tPU:lib1\\tPL:Illumina"' \
        .format(sample=sample_id)
    if "QC" in fastq_read1_in:
        primer_bedpe_file = self.primer_bedpe_file_QC
    else:
        primer_bedpe_file = self.primer_bedpe_file_default
    command = 'bwa mem -M -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
              '| {bamclipper} -i -p {primer_bedpe_file} -n {cores} ' \
              '| samtools view -u -h -q 1 -f 2 -F 4 -F 8 -F 256 - ' \
              '| samtools sort -@ {cores} -o {bam}; samtools index {bam}'.format(
                  cores=cores,
                  read_group=read_group,
                  fastq_read1=fastq_read1_in,
                  fastq_read2=fastq_read2_in,
                  reference=self.reference,
                  bamclipper=self.bamclipper,
                  primer_bedpe_file=primer_bedpe_file,
                  bam=bam_out)
    run_stage(self.state, 'align_bwa', command)

def combine_gvcf_gatk(self, vcf_files_in, vcf_out):
    '''Combine G.VCF files for all samples using GATK'''
    merge_commands = []
    temp_merge_outputs = []
    mem = self.state.config.get_stage_options('combine_gvcf_gatk', 'mem')
    # str.rstrip('.vcf') strips characters, not the suffix; remove the
    # extension explicitly instead
    vcf_out_base = vcf_out[:-len('.vcf')] if vcf_out.endswith('.vcf') else vcf_out
    for n in range(0, int(math.ceil(float(len(vcf_files_in)) / 200.0))):
        start = n * 200
        filelist = vcf_files_in[start:start + 200]
        filelist_command = ' '.join(['--variant ' + vcf for vcf in filelist])
        temp_merge_filename = vcf_out_base + ".temp_{start}.vcf".format(start=str(start))
        gatk_args_full = "java -Xmx{mem}g -jar {jar_path} -T CombineGVCFs -R {reference} " \
                         "--disable_auto_index_creation_and_locking_when_reading_rods " \
                         "{g_vcf_files} -o {vcf_out}; ".format(
                             reference=self.reference,
                             jar_path=self.gatk_jar,
                             mem=mem,
                             g_vcf_files=filelist_command,
                             vcf_out=temp_merge_filename)
        merge_commands.append(gatk_args_full)
        temp_merge_outputs.append(temp_merge_filename)
    final_merge_vcfs = ' '.join(['--variant ' + vcf for vcf in temp_merge_outputs])
    gatk_args_full_final = "java -Xmx{mem}g -jar {jar_path} -T CombineGVCFs -R {reference} " \
                           "--disable_auto_index_creation_and_locking_when_reading_rods " \
                           "{g_vcf_files} -o {vcf_out}".format(
                               reference=self.reference,
                               jar_path=self.gatk_jar,
                               mem=mem,
                               g_vcf_files=final_merge_vcfs,
                               vcf_out=vcf_out)
    merge_commands.append(gatk_args_full_final)
    final_command = ''.join(merge_commands)
    run_stage(self.state, 'combine_gvcf_gatk', final_command)

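# The 200-file batching used in concatenate_vcfs and combine_gvcf_gatk above
# keeps each merge command line at a manageable size. A hypothetical helper
# that both stages could share is sketched below for illustration only; it is
# not part of the original pipeline.
def _batches_example(paths, size=200):
    '''Yield successive batches of at most `size` file paths.'''
    for start in range(0, len(paths), size):
        yield paths[start:start + size]

# Example: list(_batches_example(['a.vcf', 'b.vcf', 'c.vcf'], size=2))
# gives [['a.vcf', 'b.vcf'], ['c.vcf']].
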
def apply_bcftools(self, mpileup_in, vcf_out):
    '''Call variants from the mpileup output using bcftools'''
    # mpileup_in = ' '.join([vcf for vcf in vcf_files_in])
    command = 'bcftools call -vmO v -o {vcf_out} {mpileup_in}'.format(
        vcf_out=vcf_out, mpileup_in=mpileup_in)
    run_stage(self.state, 'apply_bcftools', command)

def target_coverage_bamutil_interval(self, bam_in, coverage_out):
    '''Calculate target coverage using bamutil'''
    command = 'bam stats --basic --in {bam_in} --regionList {fragment_bed} &> {coverage_out}'.format(
        bam_in=bam_in,
        fragment_bed=self.fragment_bed,
        coverage_out=coverage_out)
    run_stage(self.state, 'target_coverage_bamutil_interval', command)

def extract_genes_bedtools(self, bam_in, bam_out):
    '''Extract MMR genes from the sorted BAM file'''
    bed_file = self.state.config.get_stage_option('extract_genes_bedtools', 'bed')
    command = 'bedtools intersect -abam {bam_in} -b {bed_file} > {bam_out}' \
        .format(bam_in=bam_in, bed_file=bed_file, bam_out=bam_out)
    run_stage(self.state, 'extract_genes_bedtools', command)

def fastq_to_fasta(self, fastq_in, fasta_out):
    '''Convert FASTQ file to FASTA'''
    # -n flag says keep reads with 'N' (unknown) bases, otherwise
    # they would have been discarded
    # -Q33 means use Illumina quality scores
    command = 'zcat {fastq_in} | fastq_to_fasta -n -Q33 -o {fasta_out}'.format(
        fastq_in=fastq_in, fasta_out=fasta_out)
    run_stage(self.state, 'fastq_to_fasta', command)

def apply_bcf(self, inputs, vcf_out):
    '''Remove spanning-deletion (ALT='*') records using bcftools filter'''
    vcf_in = inputs
    cores = self.get_stage_options('apply_bcf', 'cores')
    command = "bcftools filter -e \"ALT='*'\" {vcf_in} > {vcf_out}".format(
        cores=cores, vcf_in=vcf_in, vcf_out=vcf_out)
    run_stage(self.state, 'apply_bcf', command)

def apply_vep(self, inputs, vcf_out):
    '''Apply VEP'''
    vcf_in = inputs
    cores = self.get_stage_options('apply_vep', 'cores')
    vep_command = "vep --cache --dir_cache {other_vep} " \
                  "--assembly GRCh37 --refseq --offline " \
                  "--fasta {reference} " \
                  "--sift b --polyphen b --symbol --numbers --biotype --total_length --hgvs --format vcf " \
                  "--vcf --force_overwrite --flag_pick --no_stats " \
                  "--custom {brcaexpath},brcaex,vcf,exact,0,Clinical_significance_ENIGMA," \
                  "Comment_on_clinical_significance_ENIGMA,Date_last_evaluated_ENIGMA," \
                  "Pathogenicity_expert,HGVS_cDNA,HGVS_Protein,BIC_Nomenclature " \
                  "--custom {gnomadpath},gnomAD,vcf,exact,0,AF_NFE,AN_NFE " \
                  "--custom {revelpath},RVL,vcf,exact,0,REVEL_SCORE " \
                  "--plugin MaxEntScan,{maxentscanpath} " \
                  "--plugin ExAC,{exacpath},AC,AN " \
                  "--plugin dbNSFP,{dbnsfppath},REVEL_score,REVEL_rankscore " \
                  "--plugin dbscSNV,{dbscsnvpath} " \
                  "--plugin CADD,{caddpath} " \
                  "--fork {cores} " \
                  "-i {vcf_in} " \
                  "-o {vcf_out}".format(
                      other_vep=self.other_vep,
                      cores=cores,
                      vcf_out=vcf_out,
                      vcf_in=vcf_in,
                      reference=self.reference,
                      brcaexpath=self.brcaex,
                      gnomadpath=self.gnomad,
                      revelpath=self.revel,
                      maxentscanpath=self.maxentscan,
                      exacpath=self.exac,
                      dbnsfppath=self.dbnsfp,
                      dbscsnvpath=self.dbscsnv,
                      caddpath=self.cadd)
    run_stage(self.state, 'apply_vep', vep_command)

def genotype_svtyper(self, inputs, vcf_out):
    '''Call genotypes on lumpy output using SVTyper'''
    vcf_in, [sample_bam, splitters_bam] = inputs
    command = 'svtyper -B {sample_bam} -S {splitters_bam} ' \
              '-i {vcf_in} -o {vcf_out}' \
        .format(sample_bam=sample_bam,
                splitters_bam=splitters_bam,
                vcf_in=vcf_in,
                vcf_out=vcf_out)
    run_stage(self.state, 'genotype_svtyper', command)

def filter_stats(self, txt_in, txt_out):
    '''Filter the summary file to make a 'passed' file'''
    # Only mark samples as pass if >= 80% of the target is covered at >= 10X.
    # The threshold is set to 0 for now so that every sample passes.
    awk_comm = "{if($8 >= 0){print $1\".sorted.locatit.bam\"}}"
    command = "awk '{awk_comm}' {summary_file} > {final_file}".format(
        awk_comm=awk_comm, summary_file=txt_in, final_file=txt_out)
    run_stage(self.state, 'filter_stats', command)

def structural_variants_lumpy(self, inputs, vcf_out):
    '''Call structural variants with lumpy'''
    sample_bam, [splitters_bam, discordants_bam] = inputs
    command = 'lumpyexpress -B {sample_bam} -S {splitters_bam} ' \
              '-D {discordants_bam} -o {vcf}' \
        .format(sample_bam=sample_bam,
                splitters_bam=splitters_bam,
                discordants_bam=discordants_bam,
                vcf=vcf_out)
    run_stage(self.state, 'structural_variants_lumpy', command)

def apply_multicov(self, bam_in, multicov):
    '''Count reads over the target intervals using bedtools multicov'''
    bams = ' '.join([bam for bam in bam_in])
    safe_make_dir('coverage')
    command = 'bedtools multicov -bams {bams} -bed {target_bed} > {multicov}'.format(
        bams=bams, target_bed=self.target_bed, multicov=multicov)
    run_stage(self.state, 'apply_multicov', command)

def apply_summarize_picard(self, input, output):
    '''Summarize picard coverage'''
    command = 'python coverage_summary.py > {output} '.format(output=output)
    run_stage(self.state, 'apply_summarize_picard', command)

def rare_variants_famseq(self, selected_vcf, vcf_out):
    '''Call rare variants with pedigree information using FamSeq'''
    # e.g. FamSeq vcf -vcfFile ../TestData/test.vcf -pedFile ../TestData/fam01.ped -output test.FamSeq.vcf -v
    # FamSeq methods - 1: Bayesian network; 2: Elston-Stewart algorithm; 3: MCMC
    command = "PATH=/vlsci/LSC0007/shared/jessica_testing/software/FamSeq/src/:$PATH ; " \
              "FamSeq vcf -method 2 -vcfFile {selected_vcf} -pedFile {ped_file} -output {vcf_out}".format(
                  selected_vcf=selected_vcf,
                  ped_file=self.famseq_ped_file,
                  vcf_out=vcf_out)
    run_stage(self.state, 'rare_variants_famseq', command)

def generate_amplicon_metrics(self, bam_in, txt_out, sample):
    '''Generate depth information for each amplicon and sample for heatmap plotting'''
    command = 'bedtools coverage -f 5E-1 -a {bed_intervals} -b {bam_in} | ' \
              'sed "s/$/ {sample}/g" > {txt_out}'.format(
                  bed_intervals=self.interval_file,
                  bam_in=bam_in,
                  sample=sample,
                  txt_out=txt_out)
    run_stage(self.state, 'generate_amplicon_metrics', command)

def translocations_delly(self, bams_in, vcf_out):
    '''Call translocations with delly'''
    bams_args = ' '.join(bams_in)
    threads = self.state.config.get_stage_option('structural_variants_delly', 'cores')
    exclude = self.state.config.get_stage_option('structural_variants_delly', 'exclude')
    command = 'OMP_NUM_THREADS={threads} delly -t TRA -x {exclude} -o {vcf_out} -g {reference} {bams}' \
        .format(threads=threads,
                exclude=exclude,
                vcf_out=vcf_out,
                reference=self.reference,
                bams=bams_args)
    run_stage(self.state, 'structural_variants_delly', command)

def apply_samtools_mpileup(self, bam_in, mpileup_out_bcf):
    '''Samtools mpileup'''
    bams = ' '.join([bam for bam in bam_in])
    safe_make_dir('variants')
    command = 'samtools mpileup -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -go {mpileup_out_bcf} ' \
              '-f {reference} {bams}'.format(
                  mpileup_out_bcf=mpileup_out_bcf,
                  reference=self.reference,
                  bams=bams)
    run_stage(self.state, 'apply_samtools_mpileup', command)

def alignment_stats_bamtools(self, bam_in, outputs):
    '''Get alignment stats using Bamtools'''
    command = "bamtools stats -in {bam} > {out}".format(bam=bam_in, out=outputs)
    run_stage(self.state, 'alignment_stats_bamtools', command)

def snpEff_annotate(self, vcf_in, outputs):
    '''Get annotation results using snpEff'''
    command = "SnpEff -lof {vcf_in} > {out}".format(vcf_in=vcf_in, out=outputs)
    run_stage(self.state, 'snpEff_annotation_tools', command)

def apply_homopolymer_ann(self, inputs, vcf_out):
    '''Apply HomopolymerRun annotation to undr_rover output'''
    vcf_in = inputs
    # safe_make_dir('variants')
    command = "echo \"##INFO=<ID=HRUN,Number=1,Type=String,Description=\"HRun\">\" > header.tmp; " \
              "bcftools annotate -a {hrfile} -c CHROM,FROM,TO,HRUN " \
              "-h header.tmp " \
              "{vcf_in} > {vcf_out}".format(
                  hrfile=self.hrfile,
                  vcf_in=vcf_in,
                  vcf_out=vcf_out)
    run_stage(self.state, 'apply_cat_vcf', command)

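# Note on the stage above: with -c CHROM,FROM,TO,HRUN, bcftools annotate
# expects the file referenced by self.hrfile to be a bgzip-compressed,
# tabix-indexed tab-delimited file whose columns match that ordering,
# for example (values are illustrative only):
#   chr13   32900000   32900000   6
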
def apply_vt(self, inputs, vcf_out):
    '''Decompose and normalise variants using vt'''
    vcf_in = inputs
    cores = self.get_stage_options('apply_vt', 'cores')
    vt_command = "{vt_path} decompose -s {vcf_in} - | {vt_path2} normalize -r {reference} " \
                 "-o {vcf_out} - ".format(
                     vt_path=self.vt_path,
                     vcf_in=vcf_in,
                     vt_path2=self.vt_path,
                     reference=self.reference,
                     vcf_out=vcf_out)
    run_stage(self.state, 'apply_vt', vt_command)

def read_samples(self, input_pth, outputs):
    '''Read the list of pass samples and touch files accordingly in the
    metrics/pass_samples folder'''
    with open(input_pth, 'r') as inputf:
        pass_files = inputf.read().split('\n')
    command_l = []
    for f in pass_files:
        command_l.append("metrics/pass_samples/{}".format(f))
    command = 'touch {}'.format(' '.join(command_l))
    run_stage(self.state, 'read_samples', command)

def apply_vcfanno(self, inputs, vcf_out):
    '''Annotate the VCF using vcfanno'''
    vcf_in = inputs
    # cores = self.get_stage_options('apply_snpeff', 'cores')
    anno_command = "./vcfanno_linux64 -lua {annolua} {anno} {vcf_in} > {vcf_out}".format(
        annolua=self.annolua,
        anno=self.anno,
        vcf_in=vcf_in,
        vcf_out=vcf_out)
    run_stage(self.state, 'apply_vcfanno', anno_command)

def apply_snpeff(self, inputs, vcf_out):
    '''Apply SnpEff'''
    vcf_in = inputs
    # cores = self.get_stage_options('apply_snpeff', 'cores')
    # mem = int(self.state.config.get_stage_options(stage, 'mem'))
    mem = int(self.get_stage_options('apply_snpeff', 'mem')) - 2
    snpeff_command = "java -Xmx{mem}g -jar {snpeff_path} eff -c {snpeff_conf} " \
                     "-canon GRCh37.75 {vcf_in} | bgzip -c > {vcf_out}".format(
                         mem=mem,
                         snpeff_path=self.snpeff_path,
                         snpeff_conf=self.snpeff_conf,
                         vcf_in=vcf_in,
                         vcf_out=vcf_out)
    run_stage(self.state, 'apply_snpeff', snpeff_command)

def sort_bam_sambamba(self, bam_in, sorted_bam_out):
    '''Sort the reads in a bam file using sambamba'''
    cores = self.state.config.get_stage_option('sort_bam_sambamba', 'cores')
    # Get the tmp directory
    tmp = self.state.config.get_option('tmp')
    # Get the amount of memory requested for the job
    mem = int(self.state.config.get_stage_option('sort_bam_sambamba', 'mem'))
    mem_limit = max(mem - 4, 1)
    command = 'sambamba sort --nthreads={cores} --memory-limit={mem}GB --tmpdir={tmp} --out={output_bam} {input_bam}' \
        .format(cores=cores,
                mem=mem_limit,
                tmp=tmp,
                input_bam=bam_in,
                output_bam=sorted_bam_out)
    run_stage(self.state, 'sort_bam_sambamba', command)

def structural_variants_pindel(self, inputs, output):
    '''Call structural variants with pindel'''
    bam_in, [config_in, reference_in] = inputs
    cores = self.state.config.get_stage_option('structural_variants_pindel', 'cores')
    command = 'pindel -T {threads} -f {reference} -i {config} -c ALL -o {output}'.format(
        threads=cores,
        reference=reference_in,
        config=config_in,
        output=output)
    run_stage(self.state, 'structural_variants_pindel', command)

def run_connor(self, bam_in, bam_out):
    '''Run connor on the bam file'''
    command = 'connor --force -f {CONSENSUS_FREQ_THRESHOLD} ' \
              '-s {MIN_FAMILY_SIZE_THRESHOLD} ' \
              '-d {UMT_DISTANCE_THRESHOLD} ' \
              '{bam_in} {bam_out}'.format(
                  CONSENSUS_FREQ_THRESHOLD=self.CONSENSUS_FREQ_THRESHOLD,
                  MIN_FAMILY_SIZE_THRESHOLD=self.MIN_FAMILY_SIZE_THRESHOLD,
                  UMT_DISTANCE_THRESHOLD=self.UMT_DISTANCE_THRESHOLD,
                  bam_in=bam_in,
                  bam_out=bam_out)
    run_stage(self.state, 'run_connor', command)

def align_bwa(self, inputs, bam_out, sample_id):
    '''Align the paired end fastq files to the reference genome using bwa'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('align_bwa', 'cores')
    read_group = '"@RG\tID:{sample}\tSM:{sample}\tPL:Illumina"'.format(sample=sample_id)
    command = 'bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
              '| samtools view -b -h -o {bam} -' \
        .format(cores=cores,
                read_group=read_group,
                fastq_read1=fastq_read1_in,
                fastq_read2=fastq_read2_in,
                reference=self.reference,
                bam=bam_out)
    run_stage(self.state, 'align_bwa', command)

def structural_variants_socrates(self, bam_in, variants_out, sample_dir):
    '''Call structural variants with Socrates'''
    threads = self.state.config.get_stage_option('structural_variants_socrates', 'cores')
    # jvm_mem is in gb
    jvm_mem = self.state.config.get_stage_option('structural_variants_socrates', 'jvm_mem')
    bowtie2_ref_dir = self.state.config.get_stage_option('structural_variants_socrates', 'bowtie2_ref_dir')
    output_dir = os.path.join(sample_dir, 'socrates')
    safe_make_dir(output_dir)
    command = \
        '''
        cd {output_dir}
        export _JAVA_OPTIONS='-Djava.io.tmpdir={output_dir}'
        Socrates all -t {threads} --bowtie2_threads {threads} --bowtie2_db {bowtie2_ref_dir} --jvm_memory {jvm_mem}g {bam}
        '''.format(output_dir=output_dir,
                   threads=threads,
                   bowtie2_ref_dir=bowtie2_ref_dir,
                   jvm_mem=jvm_mem,
                   bam=bam_in)
    run_stage(self.state, 'structural_variants_socrates', command)

def align_bwa(self, inputs, bam_out, read_id, lib, lane, sample_id):
    # def align_bwa(self, inputs, bam_out, sample_id):
    '''Align the paired end fastq files to the reference genome using bwa'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('align_bwa', 'cores')
    safe_make_dir('alignments/{sample}'.format(sample=sample_id))
    read_group = '"@RG\\tID:{readid}\\tSM:{sample}\\tPU:lib1\\tLN:{lane}\\tPL:Illumina"' \
        .format(readid=read_id, lib=lib, lane=lane, sample=sample_id)
    command = 'bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
              '| samtools view -b -h -o {bam} -' \
        .format(cores=cores,
                read_group=read_group,
                fastq_read1=fastq_read1_in,
                fastq_read2=fastq_read2_in,
                reference=self.reference,
                bam=bam_out)
    run_stage(self.state, 'align_bwa', command)

def align_bwa(self, inputs, bam_out, sample, id):
    """Align the paired end fastq files to the reference genome using bwa"""
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options("align_bwa", "cores")
    read_group = '"@RG\\tID:{id}\\tSM:{sample}\\tPL:Illumina"'.format(sample=sample, id=id)
    command = (
        "bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} "
        "| samtools view -b -h -o {bam} -".format(
            cores=cores,
            read_group=read_group,
            fastq_read1=fastq_read1_in,
            fastq_read2=fastq_read2_in,
            reference=self.reference,
            bam=bam_out,
        )
    )
    safe_make_dir("results/alignments/{sample}".format(sample=sample))
    run_stage(self.state, "align_bwa", command)

def align_bwa(self, inputs, bam_out, sample):
    '''Align the paired end fastq files to the reference genome using bwa'''
    fastq_read1_in, fastq_read2_in = inputs
    # Get the read group information for this sample from the configuration file
    read_group = self.state.config.get_read_group(sample)
    # Get the number of cores to request for the job, this translates into the
    # number of threads to give to bwa's -t option
    cores = self.state.config.get_stage_option('align_bwa', 'cores')
    # Run bwa and pipe the output through samtools view to generate a BAM file
    command = 'bwa mem -t {cores} -R "{read_group}" {reference} {fastq_read1} {fastq_read2} ' \
              '| samtools view -S -b - > {bam}' \
        .format(cores=cores,
                read_group=read_group,
                fastq_read1=fastq_read1_in,
                fastq_read2=fastq_read2_in,
                reference=self.reference,
                bam=bam_out)
    run_stage(self.state, 'align_bwa', command)

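# For reference, the read-group string passed to `bwa mem -R` in the align_bwa
# variants above is a tab-delimited @RG header line; with example values it
# expands to something like (illustrative only):
#   @RG\tID:sample1\tSM:sample1\tPL:Illumina
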
def peak_picker_hires(self, mzml_in, mzml_out):
    '''Executes the peak picking with high_res algorithm'''
    cores = self.state.config.get_stage_option('peak_picker_hires', 'cores')
    command = "PeakPickerHiRes -threads {cores} -in {mzml_in} -out {mzml_out}".format(
        cores=cores, mzml_in=mzml_in, mzml_out=mzml_out)
    run_stage(self.state, 'peak_picker_hires', command)

def baseline_filter(self, mzml_in, mzml_out):
    '''Executes the top-hat filter to remove the baseline of an MS experiment.'''
    cores = self.state.config.get_stage_option('baseline_filter', 'cores')
    command = "BaselineFilter -threads {cores} -in {mzml_in} -out {mzml_out}".format(
        cores=cores, mzml_in=mzml_in, mzml_out=mzml_out)
    run_stage(self.state, 'baseline_filter', command)

def noise_filter_sgolay(self, mzml_in, mzml_out):
    '''Filter noise using Savitzky Golay'''
    cores = self.state.config.get_stage_option('noise_filter_sgolay', 'cores')
    command = "NoiseFilterSGolay -threads {cores} -in {mzml_in} -out {mzml_out}".format(
        cores=cores, mzml_in=mzml_in, mzml_out=mzml_out)
    run_stage(self.state, 'noise_filter_sgolay', command)

def resample(self, mzml_in, mzml_out):
    '''Resample MZML file to new sampling rate'''
    cores = self.state.config.get_stage_option('resample', 'cores')
    rate = self.state.config.get_stage_option('resample', 'rate')
    command = "Resampler -sampling_rate {rate} -threads {cores} -in {mzml_in} -out {mzml_out}".format(
        rate=rate, cores=cores, mzml_in=mzml_in, mzml_out=mzml_out)
    run_stage(self.state, 'resample', command)

def extract_discordant_alignments(self, bam_in, discordants_bam_out):
    '''Extract the discordant paired-end alignments using samtools'''
    command = 'samtools view -b -F 1294 {input_bam} > {output_bam}' \
        .format(input_bam=bam_in, output_bam=discordants_bam_out)
    run_stage(self.state, 'extract_discordant_alignments', command)

def extract_chromosomes_samtools(self, bam_in, bam_out):
    '''Extract selected chromosomes from the bam files'''
    command = 'samtools view -h -b {bam_in} chr2 chr3 chr7 > {bam_out}' \
        .format(bam_in=bam_in, bam_out=bam_out)
    run_stage(self.state, 'extract_chromosomes_samtools', command)

def bamtools_stats(self, bam_in, stats_out):
    '''Generate alignment stats with bamtools'''
    command = 'bamtools stats -in {bam} > {stats}' \
        .format(bam=bam_in, stats=stats_out)
    run_stage(self.state, 'bamtools_stats', command)

def fastqc(self, fastq_in, dir_out):
    '''Quality check fastq file using fastqc'''
    safe_make_dir(dir_out)
    command = "fastqc --quiet -o {dir} {fastq}".format(dir=dir_out, fastq=fastq_in)
    run_stage(self.state, 'fastqc', command)

def index_reference_samtools(self, reference_in, index_file_out):
    '''Index the reference genome using samtools'''
    command = 'samtools faidx {ref}'.format(ref=reference_in)
    run_stage(self.state, 'index_reference_samtools', command)

def sort_bam(self, bam_in, sorted_bam_out, sorted_bam_prefix):
    '''Sort the reads in a bam file using samtools'''
    command = 'samtools sort {input_bam} {output_bam_prefix}' \
        .format(input_bam=bam_in, output_bam_prefix=sorted_bam_prefix)
    run_stage(self.state, 'sort_bam', command)

def reference_dictionary_picard(self, reference_in, dict_file_out):
    '''Create a FASTA sequence dictionary for the reference using picard'''
    command = 'java -jar $PICARD_HOME/lib/CreateSequenceDictionary.jar ' \
              'R={ref} O={dict_file}'.format(ref=reference_in, dict_file=dict_file_out)
    run_stage(self.state, 'reference_dictionary_picard', command)

def feature_finder_centroid(self, mzml_in, feature_xml_out):
    '''The feature detection application for quantitation (centroided).'''
    cores = self.state.config.get_stage_option('feature_finder_centroid', 'cores')
    command = "FeatureFinderCentroided -threads {cores} -in {mzml_in} -out {feature_out}".format(
        cores=cores, mzml_in=mzml_in, feature_out=feature_xml_out)
    run_stage(self.state, 'feature_finder_centroid', command)

def index_reference_bowtie2(self, reference_in, index_file_out, output_prefix):
    '''Index the reference genome using bowtie2'''
    command = 'bowtie2-build {ref} {output_prefix}' \
        .format(ref=reference_in, output_prefix=output_prefix)
    run_stage(self.state, 'index_reference_bowtie2', command)

def run_java(state, stage, jar_path, mem, args):
    '''Build a java command line with java_command and run it as a pipeline stage'''
    command = java_command(jar_path, mem, args)
    run_stage(state, stage, command)

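# `java_command` is defined elsewhere in the repository and is not shown in
# this section. A minimal sketch of what such a helper plausibly looks like,
# named differently here to make clear it is an assumption rather than the
# actual implementation:
def java_command_example(jar_path, mem_in_gb, command_args):
    '''Build a java command line that runs a jar with an -Xmx memory cap.'''
    # E.g. java -Xmx4g -jar picard.jar <tool arguments>
    return 'java -Xmx{mem}g -jar {jar_path} {args}'.format(
        mem=mem_in_gb, jar_path=jar_path, args=command_args)
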
def index_bam(self, bam_in, index_out):
    '''Index a bam file with samtools'''
    command = 'samtools index {bam}'.format(bam=bam_in)
    run_stage(self.state, 'index_bam', command)

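# These stage methods are wired into a Ruffus pipeline elsewhere in the
# repository. A hedged sketch of how a stage such as index_bam is typically
# attached (the pipeline name, upstream task and file suffixes here are
# assumptions, not taken from this file):
#
#     from ruffus import Pipeline, suffix, output_from
#
#     pipeline = Pipeline(name='example_pipeline')
#     pipeline.transform(
#         task_func=stages.index_bam,
#         name='index_bam',
#         input=output_from('sort_bam'),
#         filter=suffix('.bam'),
#         output='.bam.bai')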