def realignIntervals(inputs, outputs):
    """
    Compute candidate indel-realignment intervals for one BAM file.

    Dispatches the 'realignIntervals' stage (GATK RealignTargetCreator,
    according to the original notes) through runStageCheck.

    inputs:  two-item sequence; the BAM path followed by a value that is
             unpacked but not used.
    outputs: two-item sequence; the intervals file to write and the flag
             file handed to runStageCheck.
    """
    # NOTE(review): this file appears to contain a second top-level
    # `realignIntervals` further down; if both live in one module the later
    # definition replaces this one -- confirm which is intended.
    source_bam, _upstream_flag = inputs
    intervals_out, done_flag = outputs
    stage_log = mkLogFile(logDir, source_bam, '.realignIntervals.log')
    print("calculating realignment intervals for %s" % os.path.basename(source_bam))
    # Reference lookups are done after the progress message, in the same
    # order the stage receives them.
    reference = ref_files['fasta_reference']
    gold_indels = ref_files['indels_realign_goldstandard']
    kg_indels = ref_files['indels_realign_1000G']
    runStageCheck(
        'realignIntervals',
        done_flag,
        reference,
        source_bam,
        gold_indels,
        kg_indels,
        stage_log,
        intervals_out,
    )
def getEnsemblAnnotations(inputs, outputs):
    """
    Annotate a VCF with the ENSEMBL variant effect predictor.

    Dispatches the 'annotateEnsembl' stage through runStageCheck.

    inputs:  three-item sequence; the VCF path followed by two values that
             are unpacked but not used.
    outputs: two-item sequence; the annotated output path and the flag file.
    """
    # NOTE(review): a second top-level `getEnsemblAnnotations` appears later
    # in this file -- confirm whether the duplicate is intentional.
    variants, _index, _upstream_flag = inputs
    annotated_out, done_flag = outputs
    stage_log = mkLogFile(logDir, variants, '.EnsemblAnnotation.log')
    vcf_name = os.path.basename(variants)
    print("Annotating %s with ENSEMBL variant effect predictor" % vcf_name)
    runStageCheck('annotateEnsembl', done_flag, variants, annotated_out, stage_log)
def filterIndels(inputs, outputs):
    """
    Filter raw INDEL calls (GATK VariantFiltration, per the original notes).

    Dispatches the 'filterIndels' stage through runStageCheck.

    inputs:  three-item sequence; the raw VCF followed by two unused values.
    outputs: three-item sequence; the filtered VCF, an unused middle value,
             and the flag file.
    """
    raw_vcf, _index, _upstream_flag = inputs
    filtered_vcf, _index_out, done_flag = outputs
    stage_log = mkLogFile(logDir, raw_vcf, '.filterIndels.log')
    # The progress message shows the full path, not just the basename.
    print("filtering indels from %s" % raw_vcf)
    reference = ref_files['fasta_reference']
    runStageCheck('filterIndels', done_flag, reference, raw_vcf, stage_log, filtered_vcf)
def freebayes1(inputs, outputs):
    """
    First FreeBayes run, i.e. before Base Quality Score Recalibration.

    Dispatches the 'freebayes1' stage through runStageCheck with exactly ten
    BAM files, the FASTA reference and the output VCF.

    inputs:  sequence of exactly ten BAM paths (any other length fails at
             unpacking).
    outputs: two-item sequence; the output VCF and the flag file.
    """
    b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 = inputs
    vcf_out, done_flag = outputs
    # The log name and the progress message are keyed on the second BAM.
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    stage_log = mkLogFile(logDir, b1, '.freebayes.log')
    print("call variants using FreeBayes: %s" % os.path.basename(b1))
    bams = (b0, b1, b2, b3, b4, b5, b6, b7, b8, b9)
    stage_args = (done_flag,) + bams + (ref_files['fasta_reference'], vcf_out)
    runStageCheck('freebayes1', *stage_args)
def dedup(inputs, outputs):
    """
    Remove apparent duplicates from a merged BAM (Picard MarkDuplicates, per
    the original notes).

    Dispatches the 'dedup' stage through runStageCheck.

    inputs:  two-item sequence; the merged BAM followed by an unused value.
    outputs: two-item sequence; the de-duplicated BAM and the flag file.
    """
    merged_bam, _upstream_flag = inputs
    deduped_bam, done_flag = outputs
    stage_log = mkLogFile(logDir, merged_bam, '.dedup.log')
    bam_name = os.path.basename(merged_bam)
    print("de-duping %s" % bam_name)
    runStageCheck('dedup', done_flag, merged_bam, stage_log, deduped_bam)
def realignIntervals(inputs, outputs):
    """
    Compute candidate indel-realignment intervals across 24 BAM files.

    Dispatches the 'realignIntervals' stage (GATK RealignTargetCreator,
    according to the original notes) through runStageCheck.

    inputs:  sequence of exactly 24 BAM paths (any other length fails at
             unpacking).
    outputs: two-item sequence; the intervals file and the flag file.
    """
    # NOTE(review): an earlier top-level `realignIntervals` appears in this
    # file; if both live in one module this definition replaces it --
    # confirm which is intended.
    (b0, b1, b2, b3, b4, b5, b6, b7,
     b8, b9, b10, b11, b12, b13, b14, b15,
     b16, b17, b18, b19, b20, b21, b22, b23) = inputs
    intervals_out, done_flag = outputs
    # Log name and progress message are keyed on the first BAM only.
    stage_log = mkLogFile(logDir, b0, '.realignIntervals.log')
    print("calculating realignment intervals for %s" % os.path.basename(b0))
    bams = (b0, b1, b2, b3, b4, b5, b6, b7,
            b8, b9, b10, b11, b12, b13, b14, b15,
            b16, b17, b18, b19, b20, b21, b22, b23)
    stage_args = (
        (done_flag, ref_files['fasta_reference'])
        + bams
        + (stage_log, intervals_out)
    )
    runStageCheck('realignIntervals', *stage_args)
def mpileuptosync(inputs, outputs):
    """
    Convert an mpileup file to sync format for use in Popoolation2.

    Dispatches the 'mpileuptosync' stage through runStageCheck.

    inputs:  the mpileup path itself (used as-is, not unpacked).
    outputs: two-item sequence; the sync file and the flag file.
    """
    pileup = inputs
    sync_out, done_flag = outputs
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    stage_log = mkLogFile(logDir, pileup, '.sync.log')
    pileup_name = os.path.basename(pileup)
    print("convert from mpileup to sync format: %s" % pileup_name)
    runStageCheck('mpileuptosync', done_flag, pileup, sync_out)
def mpileup(inputs, outputs):
    """
    Build an mpileup from ten de-duplicated, realigned BAMs using samtools.

    Dispatches the 'mpileup' stage through runStageCheck with the masked
    reference, the ten BAMs and the output path.

    inputs:  sequence of exactly ten BAM paths (any other length fails at
             unpacking).
    outputs: two-item sequence; the mpileup file and the flag file.
    """
    b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 = inputs
    pileup_out, done_flag = outputs
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    stage_log = mkLogFile(logDir, b0, '.mpileup.log')
    print("Make mpileup using Samtools: %s and other 9 files" % os.path.basename(b0))
    stage_args = (
        (done_flag, ref_files['masked_reference'])
        + (b0, b1, b2, b3, b4, b5, b6, b7, b8, b9)
        + (pileup_out,)
    )
    runStageCheck('mpileup', *stage_args)
def baseQualRecalCount(inputs, outputs):
    """
    First step of base quality score recalibration (GATK CountCovariates,
    per the original notes).

    Dispatches the 'baseQualRecalCount' stage through runStageCheck.

    inputs:  two-item sequence; the BAM path followed by an unused value.
    outputs: two-item sequence; the covariates CSV and the flag file.
    """
    # NOTE(review): a second top-level `baseQualRecalCount` appears later in
    # this file -- confirm whether the duplicate is intentional.
    source_bam, _upstream_flag = inputs
    covariates_csv, done_flag = outputs
    stage_log = mkLogFile(logDir, source_bam, '.baseQualRecalCount.log')
    bam_name = os.path.basename(source_bam)
    print("count covariates using GATK for base quality score recalibration: %s" % bam_name)
    reference = ref_files['fasta_reference']
    known_sites = ref_files['dbsnp']
    runStageCheck(
        'baseQualRecalCount',
        done_flag,
        source_bam,
        reference,
        known_sites,
        stage_log,
        covariates_csv,
    )
def cmh2gwas(inputs, outputs):
    """
    Convert CMH test results to GWAS format for viewing in IGV.

    Dispatches the 'cmh2gwas' stage through runStageCheck.

    inputs:  the CMH results path itself (used as-is, not unpacked).
    outputs: two-item sequence; the GWAS file and the flag file.
    """
    cmh_results = inputs
    gwas_out, done_flag = outputs
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    stage_log = mkLogFile(logDir, cmh_results, '.gwas.log')
    results_name = os.path.basename(cmh_results)
    print("convert CMH results to GWAS: %s" % results_name)
    runStageCheck('cmh2gwas', done_flag, cmh_results, gwas_out)
def selectHighQualVariants(inputs, outputs):
    """
    Select high quality variants from the initial FreeBayes calls, to serve
    as input for Base Quality Score Recalibration.

    Dispatches the 'selectHighQualVariants' stage through runStageCheck.

    inputs:  the VCF path itself (used as-is, not unpacked).
    outputs: two-item sequence; the selected-variants VCF and the flag file.
    """
    raw_vcf = inputs
    selected_vcf, done_flag = outputs
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    stage_log = mkLogFile(logDir, raw_vcf, '.highqual.log')
    vcf_name = os.path.basename(raw_vcf)
    print("select high quality variants using GATK SelectVariants: %s" % vcf_name)
    reference = ref_files['fasta_reference']
    runStageCheck('selectHighQualVariants', done_flag, reference, raw_vcf, selected_vcf)
def callHAP(inputs, outputs):
    """
    Call SNPs/indels from a recalibrated BAM (GATK HaplotypeCaller, per the
    original notes).

    Dispatches the 'callHAP' stage through runStageCheck. No progress message
    is printed (the original had it commented out).

    inputs:  two-item sequence; the BAM path followed by an unused value.
    outputs: three-item sequence; the output VCF, an unused middle value,
             and the flag file.
    """
    recal_bam, _upstream_flag = inputs
    vcf_out, _index_out, done_flag = outputs
    stage_log = mkLogFile(logDir, recal_bam, '.callHAPs.log')
    # The reference comes from the module-level `fasta_reference` name here,
    # not from the ref_files mapping.
    known_sites = ref_files['dbsnp']
    runStageCheck(
        'callHAP',
        done_flag,
        fasta_reference,
        recal_bam,
        known_sites,
        stage_log,
        vcf_out,
    )
def callVariantRecalibrator(inputs, outputs):
    """
    Build a variant recalibration model for a raw VCF.

    Dispatches the 'callVariantRecalibrator' stage through runStageCheck,
    passing the hapmap, omnimap, 1kghc and dbsnp resources from ref_files.
    (Presumably GATK VariantRecalibrator, judging by the stage name and the
    progress message -- the previous docstring said VariantFiltration.)

    inputs:  three-item sequence; the raw VCF followed by two unused values.
    outputs: four-item sequence; the recal file, the tranches file, the R
             output and the flag file.
    """
    raw_vcf, _index, _upstream_flag = inputs
    recal_out, tranches_out, rscript_out, done_flag = outputs
    stage_log = mkLogFile(logDir, raw_vcf, '.VarRecal.log')
    print("VariantRecalibrator -> %s" % raw_vcf)
    # Resource lookups keep the order in which the stage receives them.
    hapmap = ref_files['hapmap']
    omnimap = ref_files['omnimap']
    kg_high_conf = ref_files['1kghc']
    dbsnp = ref_files['dbsnp']
    # Unlike most stages here, the log file is the LAST argument.
    runStageCheck(
        'callVariantRecalibrator',
        done_flag,
        fasta_reference,
        raw_vcf,
        hapmap,
        omnimap,
        kg_high_conf,
        dbsnp,
        recal_out,
        tranches_out,
        rscript_out,
        stage_log,
    )
def callIndels(inputs, outputs):
    """
    Call indels from a recalibrated BAM (GATK UnifiedGenotyper, per the
    original notes).

    Dispatches the 'callIndels' stage through runStageCheck.

    inputs:  two-item sequence; the BAM path followed by an unused value.
    outputs: three-item sequence; the output VCF, an unused middle value,
             and the flag file.
    """
    recal_bam, _upstream_flag = inputs
    vcf_out, _index_out, done_flag = outputs
    stage_log = mkLogFile(logDir, recal_bam, '.callIndels.log')
    # The progress message shows the full path, not just the basename.
    print("calling Indels from %s" % recal_bam)
    reference = ref_files['fasta_reference']
    known_sites = ref_files['dbsnp']
    runStageCheck(
        'callIndels',
        done_flag,
        reference,
        recal_bam,
        known_sites,
        stage_log,
        vcf_out,
    )
def filterHapVcfs(inputs, outputs):
    """
    Filter raw per-sample haplotype-caller calls (GATK VariantFiltration,
    per the original notes).

    Dispatches the 'filterHapVcfs' stage through runStageCheck. No progress
    message is printed (the original had it commented out).

    inputs:  three-item sequence; the raw VCF followed by two unused values.
    outputs: three-item sequence; the filtered VCF, an unused middle value,
             and the flag file.
    """
    raw_vcf, _index, _upstream_flag = inputs
    filtered_vcf, _index_out, done_flag = outputs
    # NOTE(review): the log suffix says "filterSNPs" although this stage is
    # 'filterHapVcfs' -- looks like a copy/paste leftover; left unchanged.
    stage_log = mkLogFile(logDir, raw_vcf, '.filterSNPs.log')
    # The reference comes from the module-level `fasta_reference` name.
    runStageCheck('filterHapVcfs', done_flag, fasta_reference, raw_vcf, stage_log, filtered_vcf)
def baseQualRecal(inputs, outputs):
    """
    First step of base quality score recalibration (GATK BaseRecalibrator).

    Dispatches the 'baseQualRecal' stage through runStageCheck. Command
    template recorded with the original code:

    'command': "java -Xmx22g -jar /vlsci/VR0245/shared/charlotte-working/programs/GenomeAnalysisTKLite-2.3-9-gdcdccbb/GenomeAnalysisTKLite.jar -T BaseRecalibrator -I %bam -R %ref --knownSites %dbsnp -nt 8 -log %log -o %out"

    inputs:  two-item sequence; the BAM path followed by an unused value.
    outputs: two-item sequence; the recalibration report and the flag file.
    """
    source_bam, _upstream_flag = inputs
    recal_report, done_flag = outputs
    # The log directory is read from pipeline_options here rather than from
    # a module-level logDir.
    log_dir = pipeline_options.pipeline['logDir']
    stage_log = mkLogFile(log_dir, source_bam, '.baseQualRecal.log')
    print("Base Quality recal using GATK for: %s" % source_bam)
    dbsnp = ref_files['dbsnp']
    gold_indels = ref_files['indels_realign_goldstandard']
    runStageCheck(
        'baseQualRecal',
        done_flag,
        source_bam,
        fasta_reference,
        dbsnp,
        gold_indels,
        stage_log,
        recal_report,
    )
def getEnsemblAnnotations(inputs, outputs):
    """
    Run the ENSEMBL variant effect predictor over a VCF.

    Hands the VCF, the output path and a log file to the 'annotateEnsembl'
    stage via runStageCheck.

    inputs:  three-item sequence; only the first item (the VCF) is used.
    outputs: two-item sequence; the annotated output and the flag file.
    """
    # NOTE(review): an earlier top-level `getEnsemblAnnotations` appears in
    # this file -- confirm whether the duplicate is intentional.
    vcf_path, _unused_index, _unused_flag = inputs
    result_path, flag_path = outputs
    log_path = mkLogFile(logDir, vcf_path, '.EnsemblAnnotation.log')
    print("Annotating %s with ENSEMBL variant effect predictor"
          % os.path.basename(vcf_path))
    runStageCheck('annotateEnsembl', flag_path, vcf_path, result_path, log_path)
def realign(inputs, outputs):
    """
    Locally realign ten BAMs (GATK IndelRealigner, per the original notes).

    Dispatches the 'realign' stage through runStageCheck. According to the
    original notes the interval file is currently hard-coded (it is not
    passed from here), although it should be possible to include it
    automatically.

    inputs:  sequence of exactly ten BAM paths (any other length fails at
             unpacking).
    outputs: the flag file itself (used as-is, not unpacked).
    """
    # NOTE(review): a second top-level `realign` appears later in this file;
    # if both live in one module the later definition replaces this one.
    b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 = inputs
    done_flag = outputs
    # Log name and progress message are keyed on the first BAM only.
    stage_log = mkLogFile(logDir, b0, '.realign.2.log')
    print("realigning %s" % os.path.basename(b0))
    stage_args = (
        (done_flag, ref_files['fasta_reference'])
        + (b0, b1, b2, b3, b4, b5, b6, b7, b8, b9)
        + (stage_log,)
    )
    runStageCheck('realign', *stage_args)
def callApplyRecalibration(inputs, outputs):
    """
    Apply a previously computed variant recalibration to a VCF.

    Dispatches the 'callApplyRecalibration' stage through runStageCheck.
    (Presumably GATK ApplyRecalibration, judging by the stage name and the
    progress message -- the previous docstring said VariantFiltration.)

    inputs:  two-item sequence; first a four-item sequence (recal file,
             tranches file, R output, unused value), then the VCF to
             recalibrate. The R output is unpacked but not used.
    outputs: two-item sequence; the recalibrated VCF and the flag file.
    """
    recal_bundle, raw_vcf = inputs
    recal_file, tranches_file, _rscript, _upstream_flag = recal_bundle
    recalibrated_vcf, done_flag = outputs
    # The log is named after the recal file, not the VCF.
    stage_log = mkLogFile(logDir, recal_file, '.ApplyVarRecal.log')
    print("ApplyRecalibration SNP -> %s" % raw_vcf)
    # The log file is the LAST argument for this stage.
    runStageCheck(
        'callApplyRecalibration',
        done_flag,
        fasta_reference,
        raw_vcf,
        recal_file,
        tranches_file,
        recalibrated_vcf,
        stage_log,
    )
def realign(inputs, outputs):
    """
    Locally realign one BAM (GATK IndelRealigner, per the original notes)
    using the intervals found by realignIntervals.

    Dispatches the 'realign' stage through runStageCheck, then calls
    remove_GATK_bai on the realigned BAM.

    inputs:  two-item sequence; first (intervals file, unused value), then
             a one-item sequence holding the BAM path.
    outputs: two-item sequence; the realigned BAM and the flag file.
    """
    # NOTE(review): an earlier top-level `realign` appears in this file; if
    # both live in one module this definition replaces it.
    interval_part, bam_part = inputs
    intervals, _upstream_flag = interval_part
    (source_bam,) = bam_part
    realigned_bam, done_flag = outputs
    stage_log = mkLogFile(logDir, source_bam, '.realign.log')
    print("realigning %s" % os.path.basename(source_bam))
    reference = ref_files['fasta_reference']
    runStageCheck(
        'realign',
        done_flag,
        reference,
        source_bam,
        intervals,
        stage_log,
        realigned_bam,
    )
    remove_GATK_bai(realigned_bam)
def baseQualRecalTabulate(inputs, outputs):
    """
    Recalibrate base quality scores from the CountCovariates output (GATK
    TableRecalibration, per the original notes).

    Dispatches the 'baseQualRecalTabulate' stage through runStageCheck, then
    calls remove_GATK_bai on the recalibrated BAM.

    inputs:  two-item sequence; first (covariates CSV, unused value), then
             a one-item sequence holding the BAM path.
    outputs: two-item sequence; the recalibrated BAM and the flag file.
    """
    # NOTE(review): a second top-level `baseQualRecalTabulate` appears later
    # in this file -- confirm whether the duplicate is intentional.
    csv_part, bam_part = inputs
    covariates_csv, _upstream_flag = csv_part
    (source_bam,) = bam_part
    recal_bam, done_flag = outputs
    stage_log = mkLogFile(logDir, source_bam, '.baseQualRecalTabulate.log')
    bam_name = os.path.basename(source_bam)
    print("recalibrate base quality scores using GATK on %s" % bam_name)
    reference = ref_files['fasta_reference']
    runStageCheck(
        'baseQualRecalTabulate',
        done_flag,
        source_bam,
        reference,
        covariates_csv,
        stage_log,
        recal_bam,
    )
    remove_GATK_bai(recal_bam)
def cmhTest(inputs, outputs):
    """
    Perform a single Cochran-Mantel-Haenszel test, with populations paired
    in just one arrangement.

    Dispatches the 'cmhTest' stage through runStageCheck.

    inputs:  the sync file path itself (used as-is, not unpacked).
    outputs: two-item sequence; the test output and the flag file.
    """
    sync_file = inputs
    cmh_out, done_flag = outputs
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    stage_log = mkLogFile(logDir, sync_file, '.cmh.log')
    sync_name = os.path.basename(sync_file)
    print("perform CMH test: %s" % sync_name)
    runStageCheck('cmhTest', done_flag, sync_file, cmh_out)
def baseQualRecalPrint(inputs, outputs):
    """
    Write reads with recalibrated base quality scores, using the output of
    baseQualRecal.

    Dispatches the 'baseQualRecalPrintReads' stage through runStageCheck,
    then calls remove_GATK_bai on the output BAM. Command template recorded
    with the original code:

    'command': "java -Xmx7g -jar /vlsci/VR0245/shared/charlotte-working/programs/GenomeAnalysisTKLite-2.3-9-gdcdccbb/GenomeAnalysisTKLite.jar -T PrintReads -I %bam -R %ref -BQSR %csvfile -log %log -o %out"

    inputs:  two-item sequence; first (recalibration report, unused value),
             then a one-item sequence holding the BAM path.
    outputs: two-item sequence; the recalibrated BAM and the flag file.
    """
    report_part, bam_part = inputs
    recal_report, _upstream_flag = report_part
    (source_bam,) = bam_part
    recal_bam, done_flag = outputs
    log_dir = pipeline_options.pipeline['logDir']
    # NOTE(review): the log suffix says "Tabulate" although this stage is
    # 'baseQualRecalPrintReads' -- looks like a copy/paste leftover; left
    # unchanged.
    stage_log = mkLogFile(log_dir, source_bam, '.baseQualRecalTabulate.log')
    print("recalibrate base quality scores using GATK on %s" % source_bam)
    runStageCheck(
        'baseQualRecalPrintReads',
        done_flag,
        source_bam,
        fasta_reference,
        recal_report,
        stage_log,
        recal_bam,
    )
    remove_GATK_bai(recal_bam)
def baseQualRecalTabulate(inputs, outputs):
    """
    Apply base quality recalibration to a BAM using the CountCovariates
    table (GATK TableRecalibration, per the original notes).

    Runs the 'baseQualRecalTabulate' stage via runStageCheck and afterwards
    calls remove_GATK_bai on the BAM that was written.

    inputs:  pair of sequences; (CSV table, unused value) and a single-item
             sequence with the BAM path.
    outputs: pair; the recalibrated BAM and the flag file.
    """
    # NOTE(review): an earlier top-level `baseQualRecalTabulate` appears in
    # this file -- confirm whether the duplicate is intentional.
    table_and_flag, bam_holder = inputs
    table_csv, _unused_flag = table_and_flag
    (bam_path,) = bam_holder
    out_bam, flag_path = outputs
    log_path = mkLogFile(logDir, bam_path, '.baseQualRecalTabulate.log')
    print("recalibrate base quality scores using GATK on %s"
          % os.path.basename(bam_path))
    fasta = ref_files['fasta_reference']
    runStageCheck('baseQualRecalTabulate', flag_path, bam_path, fasta,
                  table_csv, log_path, out_bam)
    remove_GATK_bai(out_bam)
def getSnpeffAnnotations(inputs, outputs):
    """
    Annotate vcf using snpeff variant effect predictor.

    Dispatches the 'annotateSNPEff' stage through runStageCheck, passing the
    snpEff location from working_files['snpeff'] and a config path built by
    appending "snpEff.config" to it.

    inputs:  three-item sequence; the VCF path followed by two values that
             are unpacked but not used.
    outputs: two-item sequence; the annotated output and the flag file.
    """
    # Wrap `inputs` in a one-element tuple: the % operator treats a bare
    # tuple as the full argument list, so a 3-tuple `inputs` would raise
    # "TypeError: not all arguments converted during string formatting".
    # For list inputs the printed text is unchanged.
    print("Inputs: %s" % (inputs,))
    vcf, _idx, _success = inputs
    output, flag_file = outputs
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    logFile = mkLogFile(logDir, vcf, '.snpEffAnnotation.log')
    # Plain concatenation: assumes working_files['snpeff'] ends with a path
    # separator -- TODO confirm against the pipeline configuration.
    config = working_files['snpeff'] + "snpEff.config"
    runStageCheck('annotateSNPEff', flag_file, working_files['snpeff'], config, vcf, output)
def baseQualRecalCount(inputs, outputs):
    """
    Count covariates for base quality score recalibration (GATK
    CountCovariates, per the original notes).

    Runs the 'baseQualRecalCount' stage via runStageCheck with the BAM, the
    FASTA reference, the dbsnp resource, a log file and the output CSV.

    inputs:  pair; the BAM path and a value that is not used.
    outputs: pair; the covariates CSV and the flag file.
    """
    # NOTE(review): an earlier top-level `baseQualRecalCount` appears in this
    # file -- confirm whether the duplicate is intentional.
    bam_path, _unused_flag = inputs
    csv_path, flag_path = outputs
    log_path = mkLogFile(logDir, bam_path, '.baseQualRecalCount.log')
    print("count covariates using GATK for base quality score recalibration: %s"
          % os.path.basename(bam_path))
    fasta = ref_files['fasta_reference']
    dbsnp = ref_files['dbsnp']
    runStageCheck('baseQualRecalCount', flag_path, bam_path, fasta, dbsnp,
                  log_path, csv_path)
def filterSnpSift(inputs, outputs):
    """
    Filter recalibrated variants.

    Dispatches the 'filterSnpSift' stage through runStageCheck, passing the
    snpEff location from working_files['snpeff'], the input VCF and the
    output path.

    inputs:  two-item sequence; the VCF path followed by an unused value.
    outputs: two-item sequence; the filtered output and the flag file.
    """
    recal_vcf, _upstream_flag = inputs
    filtered_out, done_flag = outputs
    # The log path is not forwarded to the stage; the mkLogFile call is kept
    # because it may have side effects -- TODO confirm.
    stage_log = mkLogFile(logDir, recal_vcf, '.snpSiftFilter.log')
    snpeff_home = working_files['snpeff']
    runStageCheck('filterSnpSift', done_flag, snpeff_home, recal_vcf, filtered_out)