def indexRefbwa(inputs, outputs):
    """Build the BWA index for the reference fasta."""
    reference = inputs
    flag = outputs
    runStageCheck('indexReferenceBWA', flag, reference)
def fastqc(inputs, outputs):
    """Run FastQC quality checks over a single fastq file."""
    fastq = inputs
    fastqc_dest, flag = outputs
    # fastqc_dir is a module-level setting naming the FastQC output directory.
    runStageCheck('fastqc', flag, fastqc_dir, fastq)
def tumour_sam_to_bam(inputs, outputs):
    """Convert the tumour SAM file to BAM.

    The BAM path is derived from the SAM filename (``X.sam`` -> ``X.bam``)
    rather than taken from the declared pipeline output.
    """
    samFile, _success = inputs
    # The declared output bam is ignored; the original code unpacked it into
    # `bamFile` and then immediately overwrote it with the regex-derived path.
    _declared_bam, flagFile = outputs
    # Raw string: '\.' is an invalid escape in a plain string literal.
    match = re.search(r'(.*)\.sam', samFile)
    bamFile = match.group(1) + '.bam'
    runStageCheck('tumour_sam_to_bam', flagFile, samFile, bamFile)
def simulate_variants(inputs, outputs):
    """Simulate variants into the reference with javasim.

    The ``.fa`` extension is stripped from the input fasta path to form the
    base name passed to the 'simulate_variants' stage.
    """
    variant_fasta, _success = inputs
    fasta_out, flagFile = outputs
    # Raw string: '\/' and '\.' are invalid escapes in a plain string literal.
    # '/' needs no escaping in a regex; the pattern is otherwise unchanged.
    match = re.search(r'(.*)/([a-zA-Z0-9_\.]+)\.fa', variant_fasta)
    variant_base = '%s/%s' % (match.group(1), match.group(2))
    runStageCheck('simulate_variants', flagFile, javasim_libdir, javasim_bindir,
                  ref_fasta, variant_base)
def tumour_sort_bam(inputs, outputs):
    """Sort the tumour BAM file.

    The sort target name is derived from the input: ``X.bam`` ->
    ``X.sorted.bam``.  NOTE(review): this value is passed as the stage's
    output name; if the 'tumour_sort_bam' command uses the legacy
    ``samtools sort <in> <prefix>`` form it would append another ``.bam`` —
    confirm against the stage definition.
    """
    bamFile, _success = inputs
    # Declared output unused by the original code; the name is rebuilt below.
    _declared_sorted_bam, flagFile = outputs
    # Raw string: '\.' is an invalid escape in a plain string literal.
    match = re.search(r'(.*)\.bam', bamFile)
    sortBamPrefix = match.group(1) + '.sorted.bam'
    runStageCheck('tumour_sort_bam', flagFile, bamFile, sortBamPrefix)
def fastqc_trimmed(inputs, outputs):
    """Run FastQC over one trimmed, paired fastq file."""
    trimmed_fastq = inputs
    fastqc_dest, flag = outputs
    runStageCheck('fastqc', flag, fastqc_dir, trimmed_fastq)
def indexRefSamtools(inputs, outputs):
    """Create the samtools index for the reference fasta."""
    reference = inputs
    index_out, flag = outputs
    runStageCheck('indexReferenceSAM', flag, reference)
def igvcountDedupedBams(inputs, outputs):
    """Build a .tdf coverage file (igvtools count) for a deduped bam.

    The input is actually the fixMate-ed bam produced after de-duplication;
    the .tdf improves coverage viewing in IGV.
    """
    bam, _success = inputs
    tdf_out, flag = outputs
    print("igvtools count on %s" % os.path.basename(bam))
    runStageCheck('igvcount', flag, bam, tdf_out)
def vcfIndexIndels(inputs, outputs):
    """bgzip-compress and tabix-index the raw indel vcf for vcftools."""
    vcf, _idx, _success = inputs
    # Only the vcf path is passed; the stage derives the .gz/.tbi names itself.
    zipped, tabix_idx, flag = outputs
    print("bgzip and tabix (for vcftools) on %s" % vcf)
    runStageCheck('indexVCF', flag, vcf)
def igvcountRecalibratedBams(inputs, outputs):
    """Build a .tdf coverage file (igvtools count) for a recalibrated bam."""
    bam, _success = inputs
    tdf_out, flag = outputs
    print("igvtools count on %s" % os.path.basename(bam))
    runStageCheck('igvcount', flag, bam, tdf_out)
def indexRealignedBams(inputs, outputs):
    """samtools-index a locally realigned bam."""
    bam, _success = inputs
    index_out, flag = outputs
    print("samtools index on %s" % os.path.basename(bam))
    runStageCheck('indexBam', flag, bam)
def indexRealignedBams(inputs, outputs):
    """ Index the locally realigned bams using samtools. """
    # NOTE(review): this redefines indexRealignedBams — an earlier definition
    # in this file unpacks inputs as (bam, _success), while this one takes the
    # bam path directly. If both live in the same module, the later definition
    # shadows the earlier one; confirm which is intended.
    bam = inputs
    output, flag_file = outputs
    print "samtools index on %s" % os.path.basename(bam)
    runStageCheck('indexBam', flag_file, bam)
def countRunBam(inputs, outputs):
    """Run samtools flagstat on the initial per-lane, per-run bam."""
    bam, _success = inputs
    stats_out, flag = outputs
    print("Running samtools flagstat on %s" % bam)
    runStageCheck('flagstat', flag, bam, stats_out)
def countDedupedBam(inputs, outputs):
    """Run samtools flagstat on the de-duplicated bam."""
    bam, _success = inputs
    stats_out, flag = outputs
    print("Running samtools flagstat on %s" % bam)
    runStageCheck('flagstat', flag, bam, stats_out)
def cleanup(inputs, outputs):
    """Run the 'cleanup' stage over the tumour and normal base names."""
    bamIndex, _success = inputs
    flag = outputs
    # Base names come from module-level output-directory/name settings.
    tumour_base = '%s/%s' % (outdir, tumour_outname)
    normal_base = '%s/%s' % (outdir, normal_outname)
    runStageCheck('cleanup', flag, tumour_base, normal_base)
def indexMergedBams(inputs, outputs):
    """samtools-index a merged bam."""
    bam, _success = inputs
    index_out, flag = outputs
    print("samtools index on %s" % os.path.basename(bam))
    runStageCheck('indexBam', flag, bam)
def indexDedupedBams(inputs, outputs):
    """samtools-index a de-duplicated bam.

    The input is actually the fixMate-ed bam produced after de-duplication.
    """
    bam, _success = inputs
    index_out, flag = outputs
    print("samtools index on %s" % os.path.basename(bam))
    runStageCheck('indexBam', flag, bam)
def samToBam(inputs, outputs):
    """Convert a sam file to a coordinate-sorted bam with Picard."""
    sam, _success = inputs
    bam_out, flag = outputs
    print("converting to sorted bam: %s" % os.path.basename(sam))
    runStageCheck('samToSortedBam', flag, sam, bam_out)
def indexBaseQualRecalBam(inputs, outputs):
    """samtools-index a base-quality-recalibrated bam."""
    bam, _baseRecalBam_success = inputs
    index_out, flagFile = outputs
    # Unlike the sibling index stages, this prints the full bam path.
    print("samtools index on %s" % bam)
    runStageCheck('indexBam', flagFile, bam)
def BWAmem(inputs, outputs):
    """Align a read pair to the reference genome with bwa mem."""
    seq1, seq2 = inputs
    aligned_out, flag = outputs
    print("bwa mem on %s" % os.path.basename(seq1))
    runStageCheck('BWAmem', flag, ref_files['bwa_reference'], seq1, seq2,
                  aligned_out)
def cmh2gwas(inputs, outputs):
    """Convert CMH test results to GWAS format for viewing in IGV."""
    test_results = inputs
    gwas, flag = outputs
    # NOTE(review): logFile is created but never passed to the stage; kept in
    # case mkLogFile has a required side effect — confirm before removing.
    logFile = mkLogFile(logDir, test_results, '.gwas.log')
    print("convert CMH results to GWAS: %s" % os.path.basename(test_results))
    runStageCheck('cmh2gwas', flag, test_results, gwas)
def mpileuptosync(inputs, outputs):
    """Convert an mpileup file to sync format for Popoolation2."""
    pileup_in = inputs
    sync_out, flag = outputs
    # Created but not passed to the stage; mkLogFile side effects unknown.
    logFile = mkLogFile(logDir, pileup_in, '.sync.log')
    print("convert from mpileup to sync format: %s" % os.path.basename(pileup_in))
    runStageCheck('mpileuptosync', flag, pileup_in, sync_out)
def mpileup(inputs, outputs):
    """Run samtools mpileup over the ten dedupped, realigned bams.

    Replaces the manual ten-variable unpack with list handling; the count is
    still enforced (the original unpack raised ValueError on a mismatch too).
    """
    bams = list(inputs)
    if len(bams) != 10:
        raise ValueError('mpileup expects 10 input bams, got %d' % len(bams))
    output_mpileup, flag_file = outputs
    # Created but not passed to the stage; mkLogFile side effects unknown.
    logFile = mkLogFile(logDir, bams[0], '.mpileup.log')
    print("Make mpileup using Samtools: %s and other 9 files" % os.path.basename(bams[0]))
    stage_args = [flag_file, ref_files['masked_reference']] + bams + [output_mpileup]
    runStageCheck('mpileup', *stage_args)
def selectHighQualVariants(inputs, outputs):
    """Select high-quality variants from the initial FreeBayes calls.

    The selected set (GATK SelectVariants) feeds Base Quality Score
    Recalibration.
    """
    input_vcf = inputs
    output_vcf, flag = outputs
    # Created but not passed to the stage; mkLogFile side effects unknown.
    logFile = mkLogFile(logDir, input_vcf, '.highqual.log')
    print("select high quality variants using GATK SelectVariants: %s" % os.path.basename(input_vcf))
    runStageCheck('selectHighQualVariants', flag, ref_files['fasta_reference'],
                  input_vcf, output_vcf)
def freebayes1(inputs, outputs):
    """First FreeBayes variant-calling pass (before BQSR).

    Replaces the manual ten-variable unpack with list handling; the count is
    still enforced (the original unpack raised ValueError on a mismatch too).
    """
    bams = list(inputs)
    if len(bams) != 10:
        raise ValueError('freebayes1 expects 10 input bams, got %d' % len(bams))
    output_vcf, flag_file = outputs
    # Created but not passed to the stage; mkLogFile side effects unknown.
    # Note: log and print use the SECOND bam, as in the original.
    logFile = mkLogFile(logDir, bams[1], '.freebayes.log')
    print("call variants using FreeBayes: %s" % os.path.basename(bams[1]))
    stage_args = [flag_file] + bams + [ref_files['fasta_reference'], output_vcf]
    runStageCheck('freebayes1', *stage_args)
def filterIndels(inputs, outputs):
    """Filter raw INDEL calls with GATK VariantFiltration."""
    input_vcf, _idx, _success = inputs
    output_vcf, _idxout, flag = outputs
    logFile = mkLogFile(logDir, input_vcf, '.filterIndels.log')
    print("filtering indels from %s" % input_vcf)
    runStageCheck('filterIndels', flag, ref_files['fasta_reference'],
                  input_vcf, logFile, output_vcf)
def dedup(inputs, outputs):
    """Remove apparent duplicates from a merged bam (Picard MarkDuplicates)."""
    input_bam, _success = inputs
    output_bam, flag = outputs
    logFile = mkLogFile(logDir, input_bam, '.dedup.log')
    print("de-duping %s" % os.path.basename(input_bam))
    runStageCheck('dedup', flag, input_bam, logFile, output_bam)
def realignIntervals(inputs, outputs):
    """Find suspect intervals for local realignment (GATK RealignerTargetCreator)."""
    bam, _success = inputs
    intervals_out, flag = outputs
    logFile = mkLogFile(logDir, bam, '.realignIntervals.log')
    print("calculating realignment intervals for %s" % os.path.basename(bam))
    runStageCheck('realignIntervals', flag, ref_files['fasta_reference'], bam,
                  ref_files['indels_realign_goldstandard'],
                  ref_files['indels_realign_1000G'], logFile, intervals_out)
def filterHapVcfs(inputs, outputs):
    """ Use GATK VariantFiltration to filter raw sample HAP calls. """
    input_vcf, _idx, _success = inputs
    output_vcf, _idxout, flag_file = outputs
    # Log suffix says '.filterSNPs' even though this filters HaplotypeCaller
    # vcfs — possibly copied from a SNP-filter stage; confirm.
    logFile = mkLogFile(logDir, input_vcf, '.filterSNPs.log')
    # print "filtering haplotyper vcf from %s" % input_vcf
    # NOTE(review): passes the bare global `fasta_reference`, while sibling
    # stages use ref_files['fasta_reference'] — confirm both names exist and
    # refer to the same file.
    runStageCheck('filterHapVcfs', flag_file, fasta_reference, input_vcf, logFile, output_vcf)
def callVariantRecalibrator(inputs, outputs):
    """Run GATK VariantRecalibrator on the raw SNP calls.

    Fixes the docstring, which was copy-pasted from a VariantFiltration stage
    ("filter raw SNP calls") and did not describe this function.  Trains the
    recalibration model against the hapmap, omnimap, 1kghc and dbsnp
    resources, writing the recal table, tranches file and R script.
    """
    input_vcf, _idx, _success = inputs
    output_recal, output_tranches, output_R, flag_file = outputs
    logFile = mkLogFile(logDir, input_vcf, '.VarRecal.log')
    print("VariantRecalibrator -> %s" % input_vcf)
    # NOTE(review): bare global `fasta_reference` vs ref_files['fasta_reference']
    # used by sibling stages — confirm both are defined.
    runStageCheck('callVariantRecalibrator', flag_file, fasta_reference,
                  input_vcf, ref_files['hapmap'], ref_files['omnimap'],
                  ref_files['1kghc'], ref_files['dbsnp'], output_recal,
                  output_tranches, output_R, logFile)
def callHAP(inputs, outputs):
    """Call SNPs/indels from a recalibrated bam with GATK HaplotypeCaller."""
    bam, _success = inputs
    output_vcf, _idx, flag_file = outputs
    logFile = mkLogFile(logDir, bam, '.callHAPs.log')
    # print("calling haplotypes from %s" % bam)  # disabled in the original
    # NOTE(review): bare global `fasta_reference` vs ref_files['fasta_reference']
    # used by sibling stages — confirm.
    runStageCheck('callHAP', flag_file, fasta_reference, bam,
                  ref_files['dbsnp'], logFile, output_vcf)
def baseQualRecalCount(inputs, outputs):
    """GATK CountCovariates — first step of base quality score recalibration."""
    bam, _success = inputs
    covariates_csv, flag_file = outputs
    logFile = mkLogFile(logDir, bam, '.baseQualRecalCount.log')
    print("count covariates using GATK for base quality score recalibration: %s" % os.path.basename(bam))
    runStageCheck('baseQualRecalCount', flag_file, bam,
                  ref_files['fasta_reference'], ref_files['dbsnp'], logFile,
                  covariates_csv)
def callIndels(inputs, outputs):
    """Call indels from a recalibrated bam with GATK UnifiedGenotyper."""
    bam, _success = inputs
    output_vcf, _idx, flag_file = outputs
    logFile = mkLogFile(logDir, bam, '.callIndels.log')
    print("calling Indels from %s" % bam)
    runStageCheck('callIndels', flag_file, ref_files['fasta_reference'], bam,
                  ref_files['dbsnp'], logFile, output_vcf)
def getEnsemblAnnotations(inputs, outputs):
    """Annotate a vcf with the ENSEMBL variant effect predictor."""
    vcf, _idx, _success = inputs
    annotated_out, flag = outputs
    logFile = mkLogFile(logDir, vcf, '.EnsemblAnnotation.log')
    print("Annotating %s with ENSEMBL variant effect predictor" % os.path.basename(vcf))
    runStageCheck('annotateEnsembl', flag, vcf, annotated_out, logFile)
def leftAlign(inputs, outputs):
    """Left-align indels in a bam with GATK LeftAlignIndels.

    The stage config's underlying command is roughly:
    GenomeAnalysisTK.jar -allowPotentiallyMisencodedQuals -T LeftAlignIndels
    -I <input> -R <ref> -o <output>.  After the stage completes,
    remove_GATK_bai is called on the output bam.
    """
    bam, _realign_success = inputs
    output_bam, flagFile = outputs
    runStageCheck('leftalignindels', flagFile, bam, fasta_reference, output_bam)
    remove_GATK_bai(output_bam)
def realignIntervals(inputs, outputs):
    """Find realignment intervals (GATK RealignerTargetCreator) over 24 bams.

    Replaces the manual 24-variable unpack with list handling; the count is
    still enforced (the original unpack raised ValueError on a mismatch too).
    NOTE(review): this redefines realignIntervals — an earlier single-bam
    definition exists in this file; if both are in the same module, the later
    one shadows the earlier. Confirm which is intended.
    """
    bams = list(inputs)
    if len(bams) != 24:
        raise ValueError('realignIntervals expects 24 input bams, got %d' % len(bams))
    output_intervals, flag_file = outputs
    logFile = mkLogFile(logDir, bams[0], '.realignIntervals.log')
    print("calculating realignment intervals for %s" % os.path.basename(bams[0]))
    stage_args = [flag_file, ref_files['fasta_reference']] + bams + [logFile, output_intervals]
    runStageCheck('realignIntervals', *stage_args)
def generate_normal_reads(ref_fasta, outputs):
    """Generate simulated normal reads with SimSeq from the reference fasta."""
    # NOTE(review): unlike sibling stages, the first parameter is the fasta
    # path itself, not an (inputs, success) pair; it also shadows a module
    # global of the same name used elsewhere — confirm intent.
    fastq_r1, fastq_r2, flagFile = outputs
    normal_out = '%s/%s' % (outdir, normal_outname)
    # NOTE(review): `cov` is computed but never used below — the stage is
    # passed the module-level `normal_reads` instead. Looks like a leftover or
    # a bug; confirm whether the read count should be derived from `cov`.
    cov = norm_cov + int( (tumour_cov / 2)) #create normal component assuming half the "tumour" is normal
    runStageCheck('generate_reads_simseq', flagFile, simseq_dir, javasim_libdir, read_len, frag_len, frag_std, ref_fasta, normal_reads, normal_out)
def getEnsemblAnnotations(inputs, outputs):
    """ Annotate vcf using ENSEMBL variant effect predictor. """
    # NOTE(review): duplicate — a functionally identical getEnsemblAnnotations
    # appears earlier in this file; if both are in the same module, the later
    # definition shadows the earlier one. Confirm which is intended.
    vcf, _idx, _success = inputs
    output, flag_file = outputs
    logFile = mkLogFile(logDir, vcf, '.EnsemblAnnotation.log')
    print "Annotating %s with ENSEMBL variant effect predictor" % os.path.basename( vcf)
    runStageCheck('annotateEnsembl', flag_file, vcf, output, logFile)
def collateReadCounts(inputs, outputs):
    """Collate samtools flagstat outputs into a single table.

    The source and destination directories come from module-level settings
    (sambam_dir / results_dir), not from the task's inputs.
    """
    src_dir = sambam_dir
    dest_dir = results_dir
    flag = outputs[-1]
    print("Collating read counts")
    runStageCheck('collateReadcounts', flag, src_dir, dest_dir)
def mergeBams(inputs, outputs):
    """Merge the sorted bams for one sample with Picard.

    Picard handles the degenerate single-input case correctly.
    """
    bams = [bam for [bam, _success] in inputs]
    merged_out, flag = outputs
    baminputs = ' '.join("INPUT=%s" % bam for bam in bams)
    names = ",".join(os.path.basename(bam) for bam in bams)
    print("merging %s into %s" % (names, os.path.basename(merged_out)))
    runStageCheck('mergeBams', flag, baminputs, merged_out)
def realign(inputs, outputs):
    """Local realignment (GATK IndelRealigner) using intervals from
    realignIntervals; removes the GATK-written .bai afterwards."""
    [intervals, _success], [input_bam] = inputs
    output_bam, flag_file = outputs
    logFile = mkLogFile(logDir, input_bam, '.realign.log')
    print("realigning %s" % os.path.basename(input_bam))
    runStageCheck('realign', flag_file, ref_files['fasta_reference'],
                  input_bam, intervals, logFile, output_bam)
    remove_GATK_bai(output_bam)
def create_tumour_mixture(inputs, outputs):
    """Mix the simulated tumour reads with normal reads into a mixture sample."""
    fastq_r1, fastq_r2, _success = inputs
    mix_r1, mix_r2, flagFile = outputs
    # Split the R1 fastq path into directory and stem to build the base names.
    match = re.search(r'(.*)\/([a-zA-Z0-9_\.]+)_R(1|2)\.fq', fastq_r1)
    fastq_dir, stem = match.group(1), match.group(2)
    tumour_base = '%s/%s' % (fastq_dir, stem)
    norm_base = '%s/%s' % (outdir, normal_outname)
    mix_base = '%s/%s_mixture' % (fastq_dir, stem)
    runStageCheck('create_tumour_mixture', flagFile, tumour_base, norm_base, mix_base)
def finalDepthOfCoverage(inputs, outputs):
    """Coverage statistics for a bam via GATK DepthOfCoverage."""
    bam, _success = inputs
    flag = outputs[-1]
    # The output base name is the first declared output minus its extension.
    output_base = os.path.splitext(outputs[0])[0]
    print("calculating coverage statistics using GATK DepthOfCoverage on %s" % bam)
    runStageCheck('depthOfCoverage', flag, ref_files['fasta_reference'], bam,
                  output_base)
def baseQualRecalCount(inputs, outputs):
    """ GATK CountCovariates, first step of base quality score recalibration. """
    # NOTE(review): duplicate — a functionally identical baseQualRecalCount is
    # defined earlier in this file; if both are in the same module, the later
    # definition shadows the earlier one. Confirm which is intended.
    bam, _success = inputs
    output_csv, flag_file = outputs
    logFile = mkLogFile(logDir, bam, '.baseQualRecalCount.log')
    print "count covariates using GATK for base quality score recalibration: %s" % os.path.basename( bam)
    runStageCheck('baseQualRecalCount', flag_file, bam, ref_files['fasta_reference'], ref_files['dbsnp'], logFile, output_csv)
def baseQualRecalTabulate(inputs, outputs):
    """GATK TableRecalibration: apply base-quality recalibration using the
    CountCovariates output; removes the GATK-written .bai afterwards."""
    [input_csv, _success], [input_bam] = inputs
    output_bam, flag_file = outputs
    logFile = mkLogFile(logDir, input_bam, '.baseQualRecalTabulate.log')
    print("recalibrate base quality scores using GATK on %s" % os.path.basename(input_bam))
    runStageCheck('baseQualRecalTabulate', flag_file, input_bam,
                  ref_files['fasta_reference'], input_csv, logFile, output_bam)
    remove_GATK_bai(output_bam)
def align_tumour_reads(inputs, outputs):
    """Align the simulated tumour read pair with bowtie or bwa.

    The aligner is chosen by the module-level `aligner` setting; any other
    value raises ValueError.  NOTE(review): the declared output `samFile` is
    unpacked but unused — the sam path is rebuilt from outdir/tumour_outname.
    """
    fastq_r1, fastq_r2, _success = inputs
    samFile, flagFile = outputs
    outSam = '%s/%s.sam' % (outdir, tumour_outname)
    if aligner == 'bowtie':
        runStageCheck('align_tumour_reads_bowtie', flagFile, threads, ref_fasta,
                      fastq_r1, fastq_r2, outSam, tumour_outname, tumour_outname)
    elif aligner == 'bwa':
        # Read-group header string for bwa. Fixed: the sample tag was written
        # as "@SM:", which is not a valid @RG tag — the SAM spec names it "SM:".
        tumour_rg = '"@RG\\tID:%s\\tSM:%s"' % (tumour_outname, tumour_outname)
        runStageCheck('align_tumour_reads_bwa', flagFile, threads, ref_fasta,
                      fastq_r1, fastq_r2, tumour_rg, outSam)
    else:
        raise ValueError('Invalid aligner specified!')
def generate_tumour_reads(inputs, outputs):
    """Generate simulated tumour reads with SimSeq from the variant fasta."""
    variant_fasta, _success = inputs
    fastq_r1, fastq_r2, flagFile = outputs
    # NOTE(review): non-raw pattern — '\/' and '\.' are invalid string
    # escapes; consider a raw string. Also raises AttributeError if no match.
    match = re.search('(.*)\/([a-zA-Z0-9_\.]+)\.fa_reference.fa', variant_fasta)
    tumour_out = '%s/%s' % (match.group(1), match.group(2))
    # NOTE(review): `cov` is never used below; tumour_reads recomputes
    # int(tumour_cov / 2) directly — confirm this is intentional.
    cov = int(tumour_cov / 2) #assume variants occur only on one chromosome
    # have to tweak tumour reads based on generated reference genome
    # NOTE(review): `wc` prints "lines words bytes name", so index 2 is the
    # byte count; subtracting 4 assumes a fixed-length fasta header — fragile,
    # confirm against the generated reference files.
    proc = subprocess.Popen(["wc", variant_fasta], stdout=subprocess.PIPE)
    wc_out = proc.stdout.readline().split()
    tum_chrom_len = int( wc_out[2]) - 4 #4 = number of characters in the fasta header
    # NOTE(review): relies on Python 2 integer '/' semantics — TODO confirm.
    tumour_reads = (tum_chrom_len / frag_len) * int((tumour_cov / 2))
    runStageCheck('generate_reads_simseq', flagFile, simseq_dir, javasim_libdir, read_len, frag_len, frag_std, variant_fasta, tumour_reads, tumour_out)
def bwaPE(inputs, outputs):
    """Align a paired-end fastq pair to the reference with bwa mem.

    A read-group string is built from the fastq metadata table keyed by the
    (lexicographically first) fastq's basename.
    """
    seq1, seq2 = sorted(inputs)
    sam_out, flag = outputs
    meta = fastq_metadata[os.path.basename(seq1)]
    readgroup_metadata = {
        'PL': 'ILLUMINA',
        'SM': meta['sample'],
        'ID': "%s_%s_Lane%d" % (meta['sample'], meta['run_id'], meta['lane']),
    }
    metadata_str = make_metadata_string(readgroup_metadata)
    print("bwa-mem on %s and %s" % (os.path.basename(seq1), os.path.basename(seq2)))
    runStageCheck('bwaMemPE', flag, metadata_str, ref_files['bwa_reference'],
                  seq1, seq2, sam_out)
def variants_to_bed(inputs, outputs):
    """Write the simulated variants out in BED format."""
    fasta_in, _success = inputs
    bed_out, flagFile = outputs
    runStageCheck('variants_to_bed', flagFile, fasta_in, bed_out)