def realignIntervals(inputs, outputs):
    """
    Find suspect intervals for realignment with GATK RealignTargetCreator.

    inputs:  pair of (BAM path, success flag); only the BAM path is used.
    outputs: pair of (intervals file, flag file).

    The 'realignIntervals' stage is handed, in order: the reference FASTA,
    the BAM, the gold-standard and 1000G indel resources, the log file and
    the intervals output.
    """
    source_bam, _ignored = inputs
    intervals_out, done_flag = outputs
    log_path = mkLogFile(logDir, source_bam, '.realignIntervals.log')
    bam_name = os.path.basename(source_bam)
    print("calculating realignment intervals for %s" % bam_name)
    runStageCheck(
        'realignIntervals',
        done_flag,
        ref_files['fasta_reference'],
        source_bam,
        ref_files['indels_realign_goldstandard'],
        ref_files['indels_realign_1000G'],
        log_path,
        intervals_out)
示例#2
0
def getEnsemblAnnotations(inputs, outputs):
    """
    Annotate a VCF with the ENSEMBL variant effect predictor.

    inputs:  triple of (VCF path, index, success flag); only the VCF is used.
    outputs: pair of (annotated output, flag file).
    """
    source_vcf, _index, _ignored = inputs
    annotated_out, done_flag = outputs
    log_path = mkLogFile(logDir, source_vcf, '.EnsemblAnnotation.log')
    vcf_name = os.path.basename(source_vcf)
    print("Annotating %s with ENSEMBL variant effect predictor" % vcf_name)
    runStageCheck('annotateEnsembl', done_flag,
                  source_vcf, annotated_out, log_path)
示例#3
0
def filterIndels(inputs, outputs):
    """
    Filter raw INDEL calls with GATK VariantFiltration.

    inputs:  triple of (VCF path, index, success flag); only the VCF is used.
    outputs: triple of (filtered VCF, its index, flag file); the index entry
             is not passed to the stage.
    """
    raw_vcf, _index_in, _ignored = inputs
    filtered_vcf, _index_out, done_flag = outputs
    log_path = mkLogFile(logDir, raw_vcf, '.filterIndels.log')
    print("filtering indels from %s" % raw_vcf)
    runStageCheck(
        'filterIndels',
        done_flag,
        ref_files['fasta_reference'],
        raw_vcf,
        log_path,
        filtered_vcf)
示例#4
0
def freebayes1(inputs, outputs):
    """
    First FreeBayes run, i.e. before Base Quality Score Recalibration.

    inputs:  exactly ten BAM paths (any other count raises on unpacking).
    outputs: pair of (VCF output, flag file).
    """
    (s0, s1, s2, s3, s4,
     s5, s6, s7, s8, s9) = inputs
    vcf_out, done_flag = outputs
    # NOTE(review): the log path is derived from the second BAM and is never
    # handed to the stage; the call is kept in case mkLogFile has side effects.
    _log_path = mkLogFile(logDir, s1, '.freebayes.log')
    print("call variants using FreeBayes: %s" % os.path.basename(s1))
    stage_args = (
        s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
        ref_files['fasta_reference'],
        vcf_out)
    runStageCheck('freebayes1', done_flag, *stage_args)
示例#5
0
def dedup(inputs, outputs):
    """
    Remove apparent duplicates from a merged BAM with Picard MarkDuplicates.

    inputs:  pair of (BAM path, success flag); only the BAM path is used.
    outputs: pair of (de-duplicated BAM, flag file).
    """
    merged_bam, _ignored = inputs
    deduped_bam, done_flag = outputs
    log_path = mkLogFile(logDir, merged_bam, '.dedup.log')
    bam_name = os.path.basename(merged_bam)
    print("de-duping %s" % bam_name)
    runStageCheck('dedup', done_flag, merged_bam, log_path, deduped_bam)
示例#6
0
def realignIntervals(inputs, outputs):
    """
    Find suspect intervals for realignment with GATK RealignTargetCreator,
    over a fixed set of 24 BAM files.

    inputs:  exactly 24 BAM paths (any other count raises on unpacking).
    outputs: pair of (intervals file, flag file).

    The log file is named after the first BAM.
    """
    # Unpacking into 24 names keeps the arity check of the input list.
    (b0, b1, b2, b3, b4, b5, b6, b7,
     b8, b9, b10, b11, b12, b13, b14, b15,
     b16, b17, b18, b19, b20, b21, b22, b23) = inputs
    intervals_out, done_flag = outputs
    log_path = mkLogFile(logDir, b0, '.realignIntervals.log')
    print("calculating realignment intervals for %s" % os.path.basename(b0))
    stage_args = (
        ref_files['fasta_reference'],
        b0, b1, b2, b3, b4, b5, b6, b7,
        b8, b9, b10, b11, b12, b13, b14, b15,
        b16, b17, b18, b19, b20, b21, b22, b23,
        log_path,
        intervals_out)
    runStageCheck('realignIntervals', done_flag, *stage_args)
示例#7
0
def dedup(inputs, outputs):
    """
    Mark/remove apparent duplicates in a merged BAM (Picard MarkDuplicates).

    inputs:  two-element sequence; the first element is the BAM to process,
             the second is ignored.
    outputs: two-element sequence of (output BAM, flag file).
    """
    bam_in, _unused = inputs
    bam_out, stage_flag = outputs
    stage_log = mkLogFile(logDir, bam_in, '.dedup.log')
    print("de-duping %s" % os.path.basename(bam_in))
    runStageCheck(
        'dedup',
        stage_flag,
        bam_in,
        stage_log,
        bam_out)
示例#8
0
def mpileuptosync(inputs, outputs):
    """
    Convert an mpileup file to sync format for use in Popoolation2.

    inputs:  the mpileup path itself (not a sequence).
    outputs: pair of (sync file, flag file).
    """
    mpileup_path = inputs
    sync_out, done_flag = outputs
    # NOTE(review): the log path is computed but not handed to the stage;
    # the call is kept in case mkLogFile has side effects.
    _log_path = mkLogFile(logDir, mpileup_path, '.sync.log')
    mpileup_name = os.path.basename(mpileup_path)
    print("convert from mpileup to sync format: %s" % mpileup_name)
    runStageCheck('mpileuptosync', done_flag, mpileup_path, sync_out)
示例#9
0
def mpileup(inputs, outputs):
    """
    Build an mpileup from dedupped, realigned BAMs using samtools.

    inputs:  exactly ten BAM paths (any other count raises on unpacking).
    outputs: pair of (mpileup file, flag file).
    """
    (s0, s1, s2, s3, s4,
     s5, s6, s7, s8, s9) = inputs
    pileup_out, done_flag = outputs
    # NOTE(review): the log path is computed but not handed to the stage;
    # the call is kept in case mkLogFile has side effects.
    _log_path = mkLogFile(logDir, s0, '.mpileup.log')
    first_name = os.path.basename(s0)
    print("Make mpileup using Samtools: %s and other 9 files" % first_name)
    stage_args = (
        ref_files['masked_reference'],
        s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
        pileup_out)
    runStageCheck('mpileup', done_flag, *stage_args)
示例#10
0
def baseQualRecalCount(inputs, outputs):
    """
    GATK CountCovariates: first step of base quality score recalibration.

    inputs:  pair of (BAM path, success flag); only the BAM path is used.
    outputs: pair of (covariates CSV, flag file).
    """
    source_bam, _ignored = inputs
    csv_out, done_flag = outputs
    log_path = mkLogFile(logDir, source_bam, '.baseQualRecalCount.log')
    bam_name = os.path.basename(source_bam)
    print("count covariates using GATK for base quality score recalibration: %s" % bam_name)
    runStageCheck(
        'baseQualRecalCount',
        done_flag,
        source_bam,
        ref_files['fasta_reference'],
        ref_files['dbsnp'],
        log_path,
        csv_out)
示例#11
0
def cmh2gwas(inputs, outputs):
    """
    Convert CMH test results to GWAS format for viewing in IGV.

    inputs:  the CMH results path itself (not a sequence).
    outputs: pair of (GWAS file, flag file).
    """
    cmh_results = inputs
    gwas_out, done_flag = outputs
    # NOTE(review): the log path is computed but not handed to the stage;
    # the call is kept in case mkLogFile has side effects.
    _log_path = mkLogFile(logDir, cmh_results, '.gwas.log')
    results_name = os.path.basename(cmh_results)
    print("convert CMH results to GWAS: %s" % results_name)
    runStageCheck('cmh2gwas', done_flag, cmh_results, gwas_out)
示例#12
0
def selectHighQualVariants(inputs, outputs):
    """
    Select high quality variants from the initial FreeBayes calls, to act as
    input for Base Quality Score Recalibration.

    inputs:  the VCF path itself (not a sequence).
    outputs: pair of (selected-variants VCF, flag file).
    """
    raw_vcf = inputs
    selected_vcf, done_flag = outputs
    # NOTE(review): the log path is computed but not handed to the stage;
    # the call is kept in case mkLogFile has side effects.
    _log_path = mkLogFile(logDir, raw_vcf, '.highqual.log')
    vcf_name = os.path.basename(raw_vcf)
    print("select high quality variants using GATK SelectVariants: %s" % vcf_name)
    runStageCheck(
        'selectHighQualVariants',
        done_flag,
        ref_files['fasta_reference'],
        raw_vcf,
        selected_vcf)
示例#13
0
def callHAP(inputs, outputs):
    """
    Call SNPs/indels from a recalibrated BAM with GATK HaplotypeCaller.

    inputs:  pair of (BAM path, success flag); only the BAM path is used.
    outputs: triple of (VCF output, its index, flag file); the index entry
             is not passed to the stage.

    The reference comes from the module-level name `fasta_reference`.
    """
    recal_bam, _ignored = inputs
    vcf_out, _index_out, done_flag = outputs
    log_path = mkLogFile(logDir, recal_bam, '.callHAPs.log')
    runStageCheck(
        'callHAP',
        done_flag,
        fasta_reference,
        recal_bam,
        ref_files['dbsnp'],
        log_path,
        vcf_out)
示例#14
0
def callVariantRecalibrator(inputs, outputs):
    """
    Run the 'callVariantRecalibrator' stage (presumably GATK
    VariantRecalibrator) on a VCF.

    inputs:  triple of (VCF path, index, success flag); only the VCF is used.
    outputs: (recal file, tranches file, R script/plot output, flag file).

    The stage receives the reference, the VCF, the hapmap / omnimap / 1kghc /
    dbsnp resources, the three outputs and finally the log file.
    """
    raw_vcf, _index_in, _ignored = inputs
    recal_out, tranches_out, rscript_out, done_flag = outputs
    log_path = mkLogFile(logDir, raw_vcf, '.VarRecal.log')
    print("VariantRecalibrator -> %s" % raw_vcf)
    runStageCheck(
        'callVariantRecalibrator',
        done_flag,
        fasta_reference,
        raw_vcf,
        ref_files['hapmap'],
        ref_files['omnimap'],
        ref_files['1kghc'],
        ref_files['dbsnp'],
        recal_out,
        tranches_out,
        rscript_out,
        log_path)
示例#15
0
def callIndels(inputs, outputs):
    """
    Call indels from a recalibrated BAM with GATK UnifiedGenotyper.

    inputs:  pair of (BAM path, success flag); only the BAM path is used.
    outputs: triple of (VCF output, its index, flag file); the index entry
             is not passed to the stage.
    """
    recal_bam, _ignored = inputs
    vcf_out, _index_out, done_flag = outputs
    log_path = mkLogFile(logDir, recal_bam, '.callIndels.log')
    print("calling Indels from %s" % recal_bam)
    runStageCheck(
        'callIndels',
        done_flag,
        ref_files['fasta_reference'],
        recal_bam,
        ref_files['dbsnp'],
        log_path,
        vcf_out)
示例#16
0
def filterHapVcfs(inputs, outputs):
    """
    Filter raw per-sample HAP calls with GATK VariantFiltration.

    inputs:  triple of (VCF path, index, success flag); only the VCF is used.
    outputs: triple of (filtered VCF, its index, flag file); the index entry
             is not passed to the stage.

    The reference comes from the module-level name `fasta_reference`.
    """
    raw_vcf, _index_in, _ignored = inputs
    filtered_vcf, _index_out, done_flag = outputs
    # NOTE(review): the log suffix says "filterSNPs" although this stage is
    # filterHapVcfs; possibly a copy-paste leftover. Kept as-is.
    log_path = mkLogFile(logDir, raw_vcf, '.filterSNPs.log')
    runStageCheck(
        'filterHapVcfs',
        done_flag,
        fasta_reference,
        raw_vcf,
        log_path,
        filtered_vcf)
示例#17
0
def filterIndels(inputs, outputs):
    """
    Apply GATK VariantFiltration to raw INDEL calls.

    inputs:  three-element sequence; the first element is the VCF to filter,
             the other two are ignored.
    outputs: three-element sequence of (filtered VCF, index, flag file); the
             index entry is not passed to the stage.
    """
    vcf_in, _idx_in, _unused = inputs
    vcf_out, _idx_out, stage_flag = outputs
    stage_log = mkLogFile(logDir, vcf_in, '.filterIndels.log')
    print("filtering indels from %s" % vcf_in)
    stage_args = (ref_files['fasta_reference'], vcf_in, stage_log, vcf_out)
    runStageCheck('filterIndels', stage_flag, *stage_args)
示例#18
0
def baseQualRecal(inputs, outputs):
    """
    GATK BaseRecalibrator: first step of base quality score recalibration.

    inputs:  pair of (BAM path, success flag of the preceding step); only the
             BAM path is used.
    outputs: pair of (recalibration report, flag file).

    The log directory is read from pipeline_options.pipeline['logDir'] and the
    reference from the module-level name `fasta_reference`. Known sites passed
    to the stage are the dbsnp and gold-standard indel resources.
    """
    source_bam, _ignored = inputs
    report_out, done_flag = outputs
    log_dir = pipeline_options.pipeline['logDir']
    log_path = mkLogFile(log_dir, source_bam, '.baseQualRecal.log')
    print("Base Quality recal using GATK for: %s" % source_bam)
    runStageCheck(
        'baseQualRecal',
        done_flag,
        source_bam,
        fasta_reference,
        ref_files['dbsnp'],
        ref_files['indels_realign_goldstandard'],
        log_path,
        report_out)
示例#19
0
def getEnsemblAnnotations(inputs, outputs):
    """
    Run the ENSEMBL variant effect predictor over a VCF.

    inputs:  three-element sequence; the first element is the VCF to
             annotate, the other two are ignored.
    outputs: two-element sequence of (annotation output, flag file).
    """
    vcf_in, _idx_in, _unused = inputs
    annotation_out, stage_flag = outputs
    stage_log = mkLogFile(logDir, vcf_in, '.EnsemblAnnotation.log')
    short_name = os.path.basename(vcf_in)
    print("Annotating %s with ENSEMBL variant effect predictor" % short_name)
    runStageCheck(
        'annotateEnsembl',
        stage_flag,
        vcf_in,
        annotation_out,
        stage_log)
示例#20
0
def realign(inputs, outputs):
    """
    Local realignment with GATK IndelRealigner over a fixed set of ten BAMs,
    using intervals found by realignIntervals.

    The interval file is currently hard-coded (it is not passed in here),
    though it should be possible to include it 'automatically'.

    inputs:  exactly ten BAM paths (any other count raises on unpacking).
    outputs: the flag file itself (not a sequence).
    """
    (s0, s1, s2, s3, s4,
     s5, s6, s7, s8, s9) = inputs
    done_flag = outputs
    log_path = mkLogFile(logDir, s0, '.realign.2.log')
    print("realigning %s" % os.path.basename(s0))
    stage_args = (
        ref_files['fasta_reference'],
        s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
        log_path)
    runStageCheck('realign', done_flag, *stage_args)
示例#21
0
def callApplyRecalibration(inputs, outputs):
    """
    Run the 'callApplyRecalibration' stage (presumably GATK
    ApplyRecalibration) on a VCF.

    inputs:  pair whose first element is the four-item group
             (recal file, tranches file, R output, success flag) and whose
             second element is the input VCF. The R output and the success
             flag are not used.
    outputs: pair of (recalibrated VCF, flag file).

    The reference comes from the module-level name `fasta_reference`.
    """
    recal_group, raw_vcf = inputs
    recal_file, tranches_file, _rscript, _ignored = recal_group
    vcf_out, done_flag = outputs
    log_path = mkLogFile(logDir, recal_file, '.ApplyVarRecal.log')
    print("ApplyRecalibration SNP -> %s" % raw_vcf)
    runStageCheck(
        'callApplyRecalibration',
        done_flag,
        fasta_reference,
        raw_vcf,
        recal_file,
        tranches_file,
        vcf_out,
        log_path)
示例#22
0
def callIndels(inputs, outputs):
    """
    Indel calling on a recalibrated BAM via GATK UnifiedGenotyper.

    inputs:  two-element sequence; the first element is the BAM, the second
             is ignored.
    outputs: three-element sequence of (VCF, index, flag file); the index
             entry is not passed to the stage.
    """
    bam_in, _unused = inputs
    vcf_out, _idx_out, stage_flag = outputs
    stage_log = mkLogFile(logDir, bam_in, '.callIndels.log')
    print("calling Indels from %s" % bam_in)
    stage_args = (
        ref_files['fasta_reference'],
        bam_in,
        ref_files['dbsnp'],
        stage_log,
        vcf_out)
    runStageCheck('callIndels', stage_flag, *stage_args)
示例#23
0
def realign(inputs, outputs):
    """
    Local realignment with GATK IndelRealigner, using the intervals found by
    realignIntervals.

    inputs:  pair of groups: (intervals file, success flag) and a
             one-element group holding the BAM to realign.
    outputs: pair of (realigned BAM, flag file).

    After the stage, remove_GATK_bai is called on the output BAM.
    """
    interval_group, bam_group = inputs
    interval_file, _ignored = interval_group
    (source_bam,) = bam_group
    realigned_bam, done_flag = outputs
    log_path = mkLogFile(logDir, source_bam, '.realign.log')
    bam_name = os.path.basename(source_bam)
    print("realigning %s" % bam_name)
    runStageCheck(
        'realign',
        done_flag,
        ref_files['fasta_reference'],
        source_bam,
        interval_file,
        log_path,
        realigned_bam)
    remove_GATK_bai(realigned_bam)
示例#24
0
def baseQualRecalTabulate(inputs, outputs):
    """
    GATK TableRecalibration: recalibrate base quality scores using the output
    of CountCovariates.

    inputs:  pair of groups: (covariates CSV, success flag) and a
             one-element group holding the BAM to recalibrate.
    outputs: pair of (recalibrated BAM, flag file).

    After the stage, remove_GATK_bai is called on the output BAM.
    """
    csv_group, bam_group = inputs
    covariates_csv, _ignored = csv_group
    (source_bam,) = bam_group
    recal_bam, done_flag = outputs
    log_path = mkLogFile(logDir, source_bam, '.baseQualRecalTabulate.log')
    bam_name = os.path.basename(source_bam)
    print("recalibrate base quality scores using GATK on %s" % bam_name)
    runStageCheck(
        'baseQualRecalTabulate',
        done_flag,
        source_bam,
        ref_files['fasta_reference'],
        covariates_csv,
        log_path,
        recal_bam)
    remove_GATK_bai(recal_bam)
示例#25
0
def cmhTest(inputs, outputs):
    """
    Perform a single Cochran-Mantel-Haenszel test, with populations paired in
    just one arrangement.

    inputs:  the sync file path itself (not a sequence).
    outputs: pair of (test results file, flag file).
    """
    sync_path = inputs
    results_out, done_flag = outputs
    # NOTE(review): the log path is computed but not handed to the stage;
    # the call is kept in case mkLogFile has side effects.
    _log_path = mkLogFile(logDir, sync_path, '.cmh.log')
    sync_name = os.path.basename(sync_path)
    print("perform CMH test: %s" % sync_name)
    runStageCheck('cmhTest', done_flag, sync_path, results_out)
示例#26
0
def realign(inputs, outputs):
    """
    Run GATK IndelRealigner on one BAM over the intervals produced by
    realignIntervals.

    inputs:  two groups: [intervals file, flag] and [BAM]; the flag is
             ignored.
    outputs: two-element sequence of (realigned BAM, flag file).

    remove_GATK_bai is applied to the realigned BAM once the stage returns.
    """
    intervals_and_flag, bam_only = inputs
    target_intervals, _unused = intervals_and_flag
    (bam_in,) = bam_only
    bam_out, stage_flag = outputs
    stage_log = mkLogFile(logDir, bam_in, '.realign.log')
    print("realigning %s" % os.path.basename(bam_in))
    stage_args = (
        ref_files['fasta_reference'],
        bam_in,
        target_intervals,
        stage_log,
        bam_out)
    runStageCheck('realign', stage_flag, *stage_args)
    remove_GATK_bai(bam_out)
示例#27
0
def baseQualRecalPrint(inputs, outputs):
    """
    Write reads with recalibrated base qualities (GATK PrintReads with a BQSR
    report), using the output of baseQualRecal.

    inputs:  pair of groups: (recalibration report, success flag) and a
             one-element group holding the BAM.
    outputs: pair of (recalibrated BAM, flag file).

    The log directory is read from pipeline_options.pipeline['logDir'] and the
    reference from the module-level name `fasta_reference`. After the stage,
    remove_GATK_bai is called on the output BAM.
    """
    report_group, bam_group = inputs
    recal_report, _ignored = report_group
    (source_bam,) = bam_group
    recal_bam, done_flag = outputs
    log_dir = pipeline_options.pipeline['logDir']
    # NOTE(review): the log suffix reuses '.baseQualRecalTabulate.log' although
    # the stage is 'baseQualRecalPrintReads'; kept as-is.
    log_path = mkLogFile(log_dir, source_bam, '.baseQualRecalTabulate.log')
    print("recalibrate base quality scores using GATK on %s" % source_bam)
    runStageCheck(
        'baseQualRecalPrintReads',
        done_flag,
        source_bam,
        fasta_reference,
        recal_report,
        log_path,
        recal_bam)
    remove_GATK_bai(recal_bam)
示例#28
0
def realignIntervals(inputs, outputs):
    """
    Use GATK RealignTargetCreator to locate intervals that look like they
    need realignment.

    inputs:  two-element sequence; the first element is the BAM, the second
             is ignored.
    outputs: two-element sequence of (intervals file, flag file).
    """
    bam_in, _unused = inputs
    intervals_file, stage_flag = outputs
    stage_log = mkLogFile(logDir, bam_in, '.realignIntervals.log')
    print("calculating realignment intervals for %s" % os.path.basename(bam_in))
    stage_args = (
        ref_files['fasta_reference'],
        bam_in,
        ref_files['indels_realign_goldstandard'],
        ref_files['indels_realign_1000G'],
        stage_log,
        intervals_file)
    runStageCheck('realignIntervals', stage_flag, *stage_args)
示例#29
0
def baseQualRecalTabulate(inputs, outputs):
    """
    Recalibrate base quality scores with GATK TableRecalibration, driven by
    the CountCovariates output.

    inputs:  two groups: [covariates CSV, flag] and [BAM]; the flag is
             ignored.
    outputs: two-element sequence of (recalibrated BAM, flag file).

    remove_GATK_bai is applied to the recalibrated BAM once the stage returns.
    """
    csv_and_flag, bam_only = inputs
    recal_csv, _unused = csv_and_flag
    (bam_in,) = bam_only
    bam_out, stage_flag = outputs
    stage_log = mkLogFile(logDir, bam_in, '.baseQualRecalTabulate.log')
    print("recalibrate base quality scores using GATK on %s" % os.path.basename(bam_in))
    stage_args = (
        bam_in,
        ref_files['fasta_reference'],
        recal_csv,
        stage_log,
        bam_out)
    runStageCheck('baseQualRecalTabulate', stage_flag, *stage_args)
    remove_GATK_bai(bam_out)
示例#30
0
def getSnpeffAnnotations(inputs, outputs):
    """
    Annotate a VCF using the snpEff variant effect predictor.

    inputs:  triple of (VCF path, index, success flag); only the VCF is used.
    outputs: pair of (annotated output, flag file).

    The 'annotateSNPEff' stage receives the snpEff directory
    (working_files['snpeff']), the path of snpEff.config inside it, the VCF
    and the output path.
    """
    # Local import so this block does not rely on a module-level `import os`.
    import os

    # Wrap in a 1-tuple: with a bare tuple on the right of `%`, its elements
    # are spread over the format specifiers and a 3-tuple of inputs would
    # raise "not all arguments converted". Lists print exactly as before.
    print("Inputs: %s" % (inputs,))
    vcf, _idx, _success = inputs
    output, flag_file = outputs
    # NOTE(review): the log path is computed but not handed to the stage;
    # the call is kept in case mkLogFile has side effects.
    logFile = mkLogFile(logDir, vcf, '.snpEffAnnotation.log')
    # os.path.join gives the same result as plain concatenation when the
    # directory ends with a separator, and a correct path when it does not.
    snpeff_dir = working_files['snpeff']
    config = os.path.join(snpeff_dir, "snpEff.config")
    runStageCheck('annotateSNPEff', flag_file, snpeff_dir, config, vcf, output)
示例#31
0
def baseQualRecalCount(inputs, outputs):
    """
    Count covariates with GATK CountCovariates, the first step of base
    quality score recalibration.

    inputs:  two-element sequence; the first element is the BAM, the second
             is ignored.
    outputs: two-element sequence of (covariates CSV, flag file).
    """
    bam_in, _unused = inputs
    covariates_out, stage_flag = outputs
    stage_log = mkLogFile(logDir, bam_in, '.baseQualRecalCount.log')
    short_name = os.path.basename(bam_in)
    print("count covariates using GATK for base quality score recalibration: %s" % short_name)
    stage_args = (
        bam_in,
        ref_files['fasta_reference'],
        ref_files['dbsnp'],
        stage_log,
        covariates_out)
    runStageCheck('baseQualRecalCount', stage_flag, *stage_args)
示例#32
0
def filterSnpSift(inputs, outputs):
    """
    Filter recalibrated variants via the 'filterSnpSift' stage.

    inputs:  pair of (VCF path, success flag); only the VCF is used.
    outputs: pair of (filtered output, flag file).

    The stage receives the snpEff directory (working_files['snpeff']), the
    VCF and the output path.
    """
    source_vcf, _ignored = inputs
    filtered_out, done_flag = outputs
    # NOTE(review): the log path is computed but not handed to the stage;
    # the call is kept in case mkLogFile has side effects.
    _log_path = mkLogFile(logDir, source_vcf, '.snpSiftFilter.log')
    runStageCheck(
        'filterSnpSift',
        done_flag,
        working_files['snpeff'],
        source_vcf,
        filtered_out)