Exemplo n.º 1
0
def callVariants(args):
    """
    Run the Platypus variant-caller, with the specified arguments.

    Builds an extendedoptparse.OptionParser, registers every command-line
    option listed below (grouped as: input/miscellaneous, calling, assembly,
    QC, variant-calling filters and genome-VCF output), parses ``args`` with
    it and passes the resulting options object to runVariantCaller().

    args -- the argument list handed to parser.parse_args(). --refFile and
            --bamFiles are the only options registered with required=True.

    Nothing is returned. Any left-over positional arguments returned by
    parse_args() are not used.
    """
    parser = extendedoptparse.OptionParser()

    # Input data and miscellaneous
    # NOTE(review): type="list" is used by several options below; presumably it
    # is an option type supplied by extendedoptparse -- confirm.
    parser.add_option("-o", "--output", dest="output",  help="Output SNP data file", action='store', type='string', default="AllVariants.vcf")
    parser.add_option("--refFile",dest="refFile", help="Fasta file of reference. Index must be in same directory", action='store', type='string', required=True)
    parser.add_option("--regions", dest="regions", type="list", help = "region as comma-separated list of chr:start-end, or just list of chr, or nothing", default=None, action = 'store')
    # NOTE(review): the help text below is identical to that of --regions, yet
    # this option is a single string stored as skipRegionsFile. Looks like a
    # copy-paste; confirm the intended wording.
    parser.add_option("--skipRegionsFile", dest="skipRegionsFile", type="string", help = "region as comma-separated list of chr:start-end, or just list of chr, or nothing", default=None, action = 'store')
    parser.add_option("--bamFiles", dest="bamFiles", type="list", help = "Comma-delimited list of bam or cram file names", default=None, required=True)
    parser.add_option("--bufferSize", dest="bufferSize", type="int", help = "Data will be buffered in regions of this size", default=100000, required=False)
    parser.add_option("--minReads", dest="minReads", help="Minimum number of supporting reads required before a variant candidate will be considered.", action='store', type='int', default=2)
    parser.add_option("--maxReads", dest="maxReads", help="Maximium coverage in window", action='store', type='float', default=5000000)
    parser.add_option("--verbosity", dest="verbosity", help="Level of logging", action='store', type='int', default=2)
    # Note that --maxReadLength is stored under the shorter name "rlen".
    parser.add_option("--maxReadLength", dest="rlen", help="Maximum read length", action='store', type = 'int', default=150)
    parser.add_option("--logFileName", dest="logFileName", help="Name of log file", action='store', type='string', default="log.txt")
    # Note that --source is stored under the name "sourceFile".
    parser.add_option("--source", dest="sourceFile", help="vcf file(s) to get candidates from", action='store', type='list', default=None)
    parser.add_option("--nCPU", dest="nCPU", help="Number of processors to use", action='store', type='int', default=1)
    # NOTE(review): --parseNCBI (and --fileCaching below) pass the builtin int
    # as the type instead of the string 'int' used everywhere else; --parseNCBI
    # also has an empty help string.
    parser.add_option("--parseNCBI", dest="parseNCBI", help="", type=int, action='store', default=0)
    parser.add_option("--longHaps", dest="longHaps", help="If this is set to 1, then don't trim replacement variants from input VCFs.", type='int', action='store', default=0)
    parser.add_option("--alignScoreFile", dest="alignScoreFile", help="If this is set to a string, then alignment scores of reads to haplotypes will be writen to this file. This only work when --HLATyping flag is on", type='string', action='store', default="")
    parser.add_option("--HLATyping", dest="HLATyping", help="If this is set to 1, then run HLA genotyping mode which require a source file containing HLA haplotypes", type='int', action='store', default=0)
    parser.add_option("--compressReads", dest="compressReads", help="If this is set to 1, then all reads will be compressed, and decompressd on demand. This will slow things down, but reduce memory usage.", type='int', action='store', default=0)
    parser.add_option("--qualBinSize", dest="qualBinSize", help="This sets the granularity used when compressing quality scores. If > 1 then quality compression is lossy", type='int', action='store', default=1)
    parser.add_option("--fileCaching", dest="fileCaching", help="Sets file caching level. 0: BAM/CRAM files cached. 1: CRAM files cached. 2: No file caching.", type=int, action='store', default=0)

    # Calling Parameters
    # Boolean-like switches in this function are integer options (0/1), even
    # where the help text says "TRUE".
    parser.add_option("--maxSize", dest="maxSize", help="Largest variant to consider", action='store', type='int', default=1500)
    parser.add_option("--largeWindows", dest="largeWindows", help="If set to 1, window size can be up to 'maxSize'", action='store', type='int', default=0)
    parser.add_option("--maxVariants", dest="maxVariants", help="Maximium variants to consider in a given window", action='store', type='int', default=8)
    parser.add_option("--coverageSamplingLevel", dest="coverageSamplingLevel", help="Downsample to this level of coverage when filtering haplotypes in divergent regions.", action='store', type='int', default=30)
    parser.add_option("--maxHaplotypes", dest="maxHaplotypes", help="Maximium haplotypes to consider in a given window", action='store', type='int', default=50)
    parser.add_option("--skipDifficultWindows", dest="skipDifficultWindows", help="If set to 1, skip windows with > maxVariants candidates", action='store', type='int', default=0)
    parser.add_option("--getVariantsFromBAMs", dest="getVariantsFromBAMs", help="If set to TRUE (default), variant candidates will be generated from BAMs as well as any other inputs", action='store', type='int', default=1)
    parser.add_option("--genSNPs", dest="genSNPs", help="If set to TRUE (default), SNP candidates will be considered", action='store', type='int', default=1)
    parser.add_option("--genIndels", dest="genIndels", help="If set to TRUE (default), Indel candidates will be considered", action='store', type='int', default=1)
    parser.add_option("--mergeClusteredVariants", dest="mergeClusteredVariants", help="If set to 1, variant-containing windows which are close together will be merged, resulting in slower, more accurate variant calls in diverse regions", action='store', type='int', default=1)
    parser.add_option("--minFlank", dest="minFlank", help="Ignore base-changes closer than minFlank bases to the end of reads. Also, merge SNPs within this distance into MNPs or complex replacements", action='store', type = 'int', default=10)
    parser.add_option("--trimReadFlank", dest="trimReadFlank", help="Set base-qualities to 0 within 'trimReadFlank' bases of the end of reads", action='store', type = 'int', default=0)
    parser.add_option("--filterVarsByCoverage", dest="filterVarsByCoverage", help="If 1, Platypus filters variants in difficult regions by the number of times each variant is seen.", action='store', type='int', default=1)
    parser.add_option("--filteredReadsFrac", dest="filteredReadsFrac", help="If > this fraction of reads are filtered in a given window, the 'badReads filter is triggered.", action='store', type='float', default=0.7)
    parser.add_option("--maxVarDist", dest="maxVarDist", help="Max distance between variants to be considered in the same window", action='store', type='int', default=15) # NOTE(review): default is 15; the "9 is 1 base longer..." remark that used to sit here belongs to --minVarDist below
    parser.add_option("--minVarDist", dest="minVarDist", help="Min distance allowed between windows", action='store', type='int', default=9) # 9 is 1 base longer than the max possible alignment shift
    parser.add_option("--useEMLikelihoods", dest="useEMLikelihoods", help="If 1, likelihoods computed from EM algorithm will be used to call genotypes for each sample, otherwise likelihoods from individual sample will be used.", action='store', type='int', default=0)
    parser.add_option("--countOnlyExactIndelMatches", dest="countOnlyExactIndelMatches", help="If 1, only exactly matching indels will be counted in the NV field", action='store', type='int', default=0)
    parser.add_option("--calculateFlankScore", dest="calculateFlankScore", help="If 1, an additional alignment routine is used to calculate scores from flanks outside windows (EXPERIMENTAL).", action='store', type='int', default=0)

    # Assembly parameters
    parser.add_option("--assemble", dest="assemble", help="If 1, Cortex will be used to assemble variant candidates for Platypus to call.", action='store', type='int', default=0)
    parser.add_option("--assembleAll", dest="assembleAll", help="If 1 then Platypus will assemble all regions.'.", action='store', type='int', default=1)
    parser.add_option("--assemblyRegionSize", dest="assemblyRegionSize", help="Size of region to assemble with Cortex", action='store', type='int', default=1500)
    parser.add_option("--assembleBadReads", dest="assembleBadReads", help="If 1, then use filtered 'bad' reads for local assembly", action='store', type='int', default=1)
    parser.add_option("--assemblerKmerSize", dest="assemblerKmerSize", help="Kmer size to use for cortex assembly'.", action='store', type='int', default=15)
    parser.add_option("--assembleBrokenPairs", dest="assembleBrokenPairs", help="If 1, then use broken read pairs for local assembly", action='store', type='int', default=0)
    parser.add_option("--noCycles", dest="noCycles", help="If 1, then don't allow cycles in the graph", action='store', type='int', default=0)

    # QC Parameters
    # Every option in this group is registered with an explicit required=False.
    parser.add_option("--minMapQual", dest="minMapQual", help="Minimum mapping quality of read. Any reads with map qual below this are ignored", action='store', type = 'int', default=20, required=False)
    parser.add_option("--minBaseQual", dest="minBaseQual", help="Minimum allowed base-calling quality. Any bases with qual below this are ignored in SNP-calling", action='store', type = 'int', default=20, required=False)
    parser.add_option("--minGoodQualBases", dest="minGoodQualBases", help="Min bases per read that must have base-quality >= 20.", action='store', type = 'int', default=20, required=False)
    parser.add_option("--filterDuplicates", dest="filterDuplicates", help="If set to 1, duplicate reads will be removed based on the read-pair start and end", action='store', type = 'int', default=1, required=False)
    parser.add_option("--filterReadsWithUnmappedMates", dest="filterReadsWithUnmappedMates", help="If set to 1, reads with un-mapped mates will be removed", action='store', type = 'int', default=1, required=False)
    parser.add_option("--filterReadsWithDistantMates", dest="filterReadsWithDistantMates", help="If set to 1, reads with mates mapped far away will be removed", action='store', type = 'int', default=1, required=False)
    parser.add_option("--filterReadPairsWithSmallInserts", dest="filterReadPairsWithSmallInserts", help="If set to 1, read pairs with insert sizes < one read length will be removed", action='store', type = 'int', default=1, required=False)
    parser.add_option("--trimOverlapping", dest="trimOverlapping", help="If set to 1, overlapping paired reads have overlap set to qual 0", action='store', type = 'int', default=1, required=False)
    parser.add_option("--trimAdapter", dest="trimAdapter", help="If set to 1, then sets to qual 0 any part of read which exceeds the mapped fragment length. This is mainly useful for trimming adapter sequences", action='store', type = 'int', default=1, required=False)
    parser.add_option("--trimSoftClipped", dest="trimSoftClipped", help="If set to 1, then sets to qual 0 any soft clipped parts of the read.", action='store', type = 'int', default=1, required=False)

    # Variant-calling Filter Parameters
    parser.add_option("--maxGOF", dest="maxGOF", help="Max allowed value for goodness-of-fit test. Higher than this triggers GOF filter (Phred-scaled).", action='store', type='int', default=30)
    parser.add_option("--minPosterior", dest="minPosterior", help="Only variants with posterior >= this will be outpu to the VCF. Value is a Phred-score.", action='store', type='int', default=5)
    parser.add_option("--sbThreshold", dest="sbThreshold", help="P-value for strand-bias filtering..", action='store', type='float', default=1e-3)
    parser.add_option("--scThreshold", dest="scThreshold", help="Cut-off for SC filter.", action='store', type='float', default=0.95)
    parser.add_option("--abThreshold", dest="abThreshold", help="P-value for allele-bias filtering..", action='store', type='float', default=1e-3)
    parser.add_option("--minVarFreq", dest="minVarFreq", help="Variants below this frequency will be flagged as allele-biased", action='store', type='float', default=0.05)
    parser.add_option("--badReadsWindow", dest="badReadsWindow", help="Size of window around variant to look for low-quality bases.", action='store', type='int', default=11)
    parser.add_option("--badReadsThreshold", dest="badReadsThreshold", help="Variants where the median minimum quality in a window of badReadsWindow around the variant position falls below this value will be filtered with the flag 'badReads'.", action='store', type='int', default=15)
    parser.add_option("--rmsmqThreshold", dest="rmsmqThreshold", help="RMSMQ filter triggers when root-mean-square mapping quality across region containing variant is below this.", action='store', type='int', default=40)
    parser.add_option("--qdThreshold", dest="qdThreshold", help="QD filter triggers quality/depth for variant is below this.", action='store', type='int', default=10)
    parser.add_option("--hapScoreThreshold", dest="hapScoreThreshold", help="HapScore filter triggers HapScore for variant is above this.", action='store', type='int', default=4)

    # Genome VCF parameters
    parser.add_option("--outputRefCalls", dest="outputRefCalls", help="If 1, output block reference calls.", action='store', type='int', default=0)
    parser.add_option("--refCallBlockSize", dest="refCallBlockSize", help="Max size of reference call block.", action='store', type='int', default=1000)

    # Parse the supplied argument list and start calling; the positional
    # remainder (rebound to "args") is ignored.
    (options, args) = parser.parse_args(args)
    runVariantCaller(options)
Exemplo n.º 2
0
def continueCalling(args):
    """
    Restart Platypus from the partially completed VCF output of a previous job.

    The only option read from ``args`` is --vcfFile, the VCF written by the
    previous, incomplete job. The options of that job are recovered from the
    "platypusOptions=" header line of the VCF, and calling restarts from the
    latest sensible position: the last integer multiple of --bufferSize at or
    below the (0-based) position of the final record in the VCF.

    The input VCF is not modified. Its header and every record before the
    restart position are copied to a new file, whose name is the input name
    with ".vcf" replaced by "_ContinuedFromFailedProcess.vcf", and
    runVariantCaller() is then invoked with continuing=True, that file as
    output, and the remaining regions stored in options.unfinishedRegions.

    The function logs an error and returns without calling when:
      * --vcfFile was not given;
      * the VCF has no "platypusOptions=" header line;
      * the VCF contains no variant records;
      * the previous job was not a single-process job (--nCPU != 1).

    Raises StandardError if no region of the previous job ends at the restart
    position.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed("Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be.")
    parser = extendedoptparse.OptionParser()
    parser.add_option("--vcfFile", dest="vcfFile", help="Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus", action='store', type='string')
    (options, args) = parser.parse_args(args)

    if not options.vcfFile:
        # Without this guard the .replace() below fails with AttributeError.
        logger.error("No VCF file was specified (--vcfFile), so there is nothing to continue from")
        logger.error("Quitting now.")
        return

    continuedSuffix = "_ContinuedFromFailedProcess.vcf"
    newOutputFileName = options.vcfFile.replace(".vcf", continuedSuffix)

    if newOutputFileName == options.vcfFile:
        # The input name contains no ".vcf"; never open the input itself for
        # writing, as that would truncate it while it is still being read.
        newOutputFileName = options.vcfFile + continuedSuffix

    logger.info("Platypus will now attempt to finish running a failed process, from the VCF output in file %s" %(options.vcfFile))
    logger.info("Complete output (old + new) will go to file %s" %(newOutputFileName))

    theVCF = Open(options.vcfFile, 'r')

    try:
        lastRecord = None
        platypusOptions = None

        # First pass: recover the old options and remember the last data line.
        for line in theVCF:

            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            if line.strip() and not line.startswith("#"):
                lastRecord = line

        if platypusOptions is None:
            logger.error("Could not parse old platypus options from VCF %s" %(options.vcfFile))
            logger.error("Check that VCF file is a valid platypus output file")
            logger.error("Quitting now.")
            return

        if lastRecord is None:
            # Header-only (or empty) VCF: there is no coordinate to resume from.
            logger.error("VCF %s contains no variant records, so there is no position to continue from" %(options.vcfFile))
            logger.error("Quitting now.")
            return

        if platypusOptions.nCPU != 1:
            logger.error("Platypus can only currently continue from single process jobs")
            logger.error("The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1).")
            logger.error("Quitting now.")
            return

        cols = lastRecord.strip().split("\t")

        lastChrom = cols[0]
        realLastPos = int(cols[1]) - 1  # VCF POS is 1-based
        # Round down to a whole number of buffers.
        lastPos = (realLastPos//platypusOptions.bufferSize)*platypusOptions.bufferSize

        logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s" %(lastChrom,realLastPos,lastChrom,lastPos))
        allRegions = sorted(platypusutils.getRegions(platypusOptions), cmp=regionSort)
        theIndex = -1

        # The region ending exactly at lastPos is the last completed one; the
        # region after it is where calling resumes.
        for index,region in enumerate(allRegions):
            if region[0] == lastChrom and region[2] == lastPos:
                theIndex = index + 1

        if theIndex == -1:
            raise StandardError("Could not find region which was unfinished in input VCF")

        logger.info("Platypus will continue calling. Output will go to file %s." %(newOutputFileName))

        doneRegions = allRegions[:theIndex]
        doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

        # Reset input VCF file
        theVCF.seek(0,0)

        # Make new file to store complete output
        outputVCF = Open(newOutputFileName, "w")

        try:
            # Copy old, unfinished VCF into new VCF
            for line in theVCF:

                if line.startswith("#"):
                    outputVCF.write(line)
                    continue

                if not line.strip():
                    # Blank lines carry no record and cannot be parsed.
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                if chrom in doneChroms:
                    outputVCF.write(line)

                elif chrom == lastChrom and pos < lastPos:
                    outputVCF.write(line)

                else:
                    # First record at or beyond the restart point: everything
                    # from here on will be re-called.
                    break
        finally:
            outputVCF.close()
    finally:
        theVCF.close()

    platypusOptions.unfinishedRegions = allRegions[theIndex:]
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)