Пример #1
0
    def __init__(self,params) :
        """
        Normalize the run parameters, create the run directory layout and load
        reference chromosome information.

        params -- option object; it is stored as self.params and updated in place
                  (boolean options, bam lists, directory paths, chromosome
                  order/sizes, call regions and derived flags)

        Raises Exception if no reference fasta is defined.
        """

        cleanPyEnv()

        self.params=params

        # normalize boolean option input:
        for boolOptionName in ("enableRemoteReadRetrievalForInsertionsInGermlineCallingModes",
                               "enableRemoteReadRetrievalForInsertionsInCancerCallingModes",
                               "useOverlapPairEvidence") :
            safeSetBool(self.params,boolOptionName)

        # Use RNA option for minCandidate size
        if self.params.isRNA:
            self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

        # format bam lists:
        if self.params.normalBamList is None : self.params.normalBamList = []
        if self.params.tumorBamList is None : self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir=os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir=os.path.join(self.params.runDir,"workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir=os.path.join(self.params.runDir,"results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir=os.path.join(self.params.resultsDir,"stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir=os.path.join(self.params.resultsDir,"variants")
        ensureDir(self.params.variantsDir)
        self.params.evidenceDir=os.path.join(self.params.resultsDir,"evidence")
        ensureDir(self.params.evidenceDir)
        # (disabled) self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        # (disabled) ensureDir(self.params.reportsDir)

        # Test for a missing reference before deriving the index path from it:
        # concatenating ".fai" onto None would raise TypeError and hide the
        # intended error message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        checkFile(self.params.referenceFasta,"reference fasta")
        indexRefFasta=self.params.referenceFasta+".fai"
        checkFile(indexRefFasta,"reference fasta index")

        # read fasta index
        (self.params.chromOrder,self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)
        # determine subset of chroms where we can skip calling entirely
        (self.params.callRegionList, self.params.chromIsSkipped) = getCallRegions(self.params)

        self.paths = PathInfo(self.params)

        # the high depth filter is only enabled when the input is neither exome nor RNA
        self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)

        # always use overlapping pairs for RNA calling
        if (self.params.isRNA) :
            self.params.useOverlapPairEvidence = True
Пример #2
0
def main() :
    """
    Estimate per-chromosome depth for the bam file given on the command line.

    The average read length is approximated from a subsample ("-s 0.1") of the
    mapped reads whose cigar is a single match segment. It is combined with the
    samtools idxstats output to write one line per chromosome to stdout:

        chrom <tab> depth <tab> sampled read count <tab> average read length

    Chromosomes shorter than the average read length are not reported.

    Raises Exception if 100000 or fewer usable reads were sampled.
    """

    import re, signal, subprocess

    (options,args) = getOptions()

    checkDir(libexecDir)

    samtoolsBin = os.path.join(libexecDir,'samtools')
    checkFile(samtoolsBin,"samtools binary")

    # chromData maps chrom name -> (idxstats column 2, idxstats column 3), which per the
    # samtools manual are the sequence length and the mapped read count;
    # chromList preserves the idxstats output order
    chromData = {}
    chromList = []

    # Commands are passed as argument lists (no shell) so that bam paths containing
    # spaces or shell metacharacters reach samtools verbatim. universal_newlines
    # makes the pipe yield text lines on both python 2 and python 3.
    idxstatsProc = subprocess.Popen([samtoolsBin,"idxstats",options.bamFile],
                                    stdout=subprocess.PIPE,universal_newlines=True)
    for line in idxstatsProc.stdout :
        word = line.strip().split('\t')
        if word[0] == "*" : continue

        chromData[word[0]] = (int(word[1]),int(word[2]))
        chromList.append(word[0])

    # release the pipe and reap the child so that it does not linger as a zombie
    idxstatsProc.stdout.close()
    idxstatsProc.wait()


    length = 0
    count = 0

    # match any cigar with a single match sequence:
    matchRex = re.compile("^([0-9]+)M$")

    viewProc = subprocess.Popen([samtoolsBin,"view","-F","4","-s","0.1",options.bamFile],
                                stdout=subprocess.PIPE,universal_newlines=True)
    for line in viewProc.stdout :
        word = line.strip().split('\t',7)
        mr = matchRex.match(word[5])
        if mr is None : continue
        length += int(mr.group(1))
        count += 1
        if count >= 200000 : break

    # done with subprocess: without a shell in between, the signal reaches samtools
    # itself; it is only sent if samtools is still running
    if viewProc.poll() is None :
        viewProc.send_signal(signal.SIGINT)
    viewProc.stdout.close()
    viewProc.wait()


    if count <= 100000 :
        raise Exception("Unexpected read length approximation results")


    outfp=sys.stdout

    avg_length = float(length)/float(count)

    for chrom in chromList :
        (chromLength,chromReadCount) = chromData[chrom]
        if chromLength < avg_length : continue
        depth = chromReadCount*avg_length / float(chromLength)
        outfp.write("%s\t%.3f\t%s\t%.3f\n" % (chrom, depth, count, avg_length))
Пример #3
0
def getOptions() :
    """
    Parse and validate the command line.

    At least one --in file and an --out file are required, and positional
    arguments are not accepted; otherwise the help text is printed and the
    process exits with status 2. Every --in file is passed to checkFile.

    Returns the (options,args) pair produced by optparse.
    """

    from optparse import OptionParser

    optionParser = OptionParser(usage="usage: %prog [options]")

    optionParser.add_option("--in", action="append", type="string", dest="inFiles", metavar="FILE",
                            help="input depth filename, argument may be provided more than once to provide all input")
    optionParser.add_option("--out", type="string", dest="outFile", metavar="FILE",
                            help="output depth filename (required)")

    (options,args) = optionParser.parse_args()

    # stray positional arguments or a missing required option all end the same way
    isBadCommandLine = (len(args) != 0) or (options.inFiles is None) or (options.outFile is None)
    if isBadCommandLine :
        optionParser.print_help()
        sys.exit(2)

    # validate input:
    for depthFile in options.inFiles :
        checkFile(depthFile,"input depth")

    return (options,args)
Пример #4
0
def getOptions() :
    """
    Read the command line options of the script.

    Prints the help text and exits with status 2 if positional arguments are
    present, if no --in file was given or if --out is missing. Each --in file
    is handed to checkFile before returning.

    Returns (options,args) as produced by optparse.
    """

    from optparse import OptionParser

    cmdlineParser = OptionParser(usage="usage: %prog [options]")

    def exitWithHelp() :
        # shared failure path for every command line validation error
        cmdlineParser.print_help()
        sys.exit(2)

    cmdlineParser.add_option("--in", dest="inFiles", type="string", action="append", metavar="FILE",
                             help="input depth filename, argument may be provided more than once to provide all input")
    cmdlineParser.add_option("--out", dest="outFile", type="string", metavar="FILE",
                             help="output depth filename (required)")

    (options,args) = cmdlineParser.parse_args()

    if args :
        exitWithHelp()

    # validate input:
    for requiredValue in (options.inFiles, options.outFile) :
        if requiredValue is None :
            exitWithHelp()

    for inputName in options.inFiles :
        checkFile(inputName,"input depth")

    return (options,args)
Пример #5
0
def getOptions():
    """
    Parse and validate the command line.

    A --bam option is required and positional arguments are not accepted;
    otherwise the help text is printed and the process exits with status 2.
    The bam path is passed to checkFile before returning.

    Returns the (options, args) pair produced by optparse.
    """

    from optparse import OptionParser

    optionParser = OptionParser(usage="usage: %prog [options] > depth.txt")

    optionParser.add_option(
        "--bam",
        dest="bamFile",
        type="string",
        help="specify bam file for depth estimation (required)")

    (options, args) = optionParser.parse_args()

    # stray positional arguments and a missing bam file are handled identically
    isBadCommandLine = (len(args) != 0) or (options.bamFile is None)
    if isBadCommandLine:
        optionParser.print_help()
        sys.exit(2)

    # validate input:
    checkFile(options.bamFile, "input bam")

    return (options, args)
Пример #6
0
    def __init__(self, params, iniSections):
        """
        Normalize the run parameters, create the run directory layout and load
        reference chromosome information.

        params -- option object; it is stored as self.params and updated in
                  place (bam lists, directory paths, chromosome order/sizes,
                  call regions and derived flags)
        iniSections -- stored unchanged as self.iniSections

        Raises Exception if no reference fasta is defined.
        """

        cleanPyEnv()

        self.params = params
        self.iniSections = iniSections

        # Use RNA option for minCandidate size
        if self.params.isRNA:
            self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

        # format bam lists:
        if self.params.normalBamList is None:
            self.params.normalBamList = []
        if self.params.tumorBamList is None:
            self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir,
                                               "variants")
        ensureDir(self.params.variantsDir)
        self.params.evidenceDir = os.path.join(self.params.resultsDir,
                                               "evidence")
        ensureDir(self.params.evidenceDir)
        # (disabled) self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        # (disabled) ensureDir(self.params.reportsDir)

        # Test for a missing reference before deriving the index path from it:
        # concatenating ".fai" onto None would raise TypeError and hide the
        # intended error message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        checkFile(self.params.referenceFasta, "reference fasta")
        indexRefFasta = self.params.referenceFasta + ".fai"
        checkFile(indexRefFasta, "reference fasta index")

        # read fasta index
        (self.params.chromOrder,
         self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)
        # determine subset of chroms where we can skip calling entirely
        (self.params.callRegionList,
         self.params.chromIsSkipped) = getCallRegions(self.params)

        self.paths = PathInfo(self.params)

        # the high depth filter is only enabled when the input is neither exome nor RNA
        self.params.isHighDepthFilter = (not (self.params.isExome
                                              or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)
Пример #7
0
    def __init__(self,params,iniSections) :
        """
        Normalize the run parameters, create the run directory layout and load
        reference chromosome information.

        params -- option object; it is stored as self.params and updated in place
                  (directory paths, bam lists, chromosome order/sizes and
                  integer-typed binSize/nonlocalWorkBins)
        iniSections -- stored unchanged as self.iniSections

        Raises Exception if no reference fasta is defined.
        """

        # clear out some potentially destabilizing env variables:
        for key in ("PYTHONPATH", "PYTHONHOME") :
            if key in os.environ :
                del os.environ[key]

        self.params=params
        self.iniSections=iniSections

        # make sure run directory is setup:
        self.params.runDir=os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir=os.path.join(self.params.runDir,"workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir=os.path.join(self.params.runDir,"results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir=os.path.join(self.params.resultsDir,"stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir=os.path.join(self.params.resultsDir,"variants")
        ensureDir(self.params.variantsDir)
        # (disabled) self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        # (disabled) ensureDir(self.params.reportsDir)

        # Test for a missing reference before deriving the index path from it:
        # concatenating ".fai" onto None would raise TypeError and hide the
        # intended error message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        checkFile(self.params.referenceFasta,"reference fasta")
        indexRefFasta=self.params.referenceFasta+".fai"
        checkFile(indexRefFasta,"reference fasta index")

        # wrap the optional single normal/tumor bam in (possibly empty) lists:
        self.params.normalBamList = [bam for bam in (self.params.normalBam,) if bam is not None]
        self.params.tumorBamList = [bam for bam in (self.params.tumorBam,) if bam is not None]

        # read fasta index
        (self.params.chromOrder,self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

        # sanity check some parameter typing:
        self.params.binSize = int(self.params.binSize)
        self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

        self.paths = PathInfo(self.params)
Пример #8
0
    def __init__(self,params,iniSections) :
        """
        Normalize the run parameters, create the run directory layout and load
        reference chromosome information.

        params -- option object; it is stored as self.params and updated in place
                  (bam lists, directory paths, chromosome order/sizes, scanSize,
                  nonlocalWorkBins and derived flags)
        iniSections -- stored unchanged as self.iniSections

        Raises Exception if no reference fasta is defined.
        """

        # clear out some potentially destabilizing env variables:
        for key in ("PYTHONPATH", "PYTHONHOME") :
            if key in os.environ :
                del os.environ[key]

        self.params=params
        self.iniSections=iniSections

        # format bam lists:
        if self.params.normalBamList is None : self.params.normalBamList = []
        if self.params.tumorBamList is None : self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir=os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir=os.path.join(self.params.runDir,"workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir=os.path.join(self.params.runDir,"results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir=os.path.join(self.params.resultsDir,"stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir=os.path.join(self.params.resultsDir,"variants")
        ensureDir(self.params.variantsDir)
        # (disabled) self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        # (disabled) ensureDir(self.params.reportsDir)

        # Test for a missing reference before deriving the index path from it:
        # concatenating ".fai" onto None would raise TypeError and hide the
        # intended error message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        checkFile(self.params.referenceFasta,"reference fasta")
        indexRefFasta=self.params.referenceFasta+".fai"
        checkFile(indexRefFasta,"reference fasta index")

        # read fasta index
        (self.params.chromOrder,self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

        # sanity check some parameter typing:
        MEGABASE = 1000000
        # scanSizeMb is given in megabases; scanSize is the same quantity in bases
        self.params.scanSize = int(self.params.scanSizeMb) * MEGABASE
        self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

        self.paths = PathInfo(self.params)

        # the high depth filter is only enabled when the input is neither exome nor RNA
        self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)
Пример #9
0
    def __init__(self,params,iniSections) :
        """
        Normalize the run parameters, create the run directory layout and load
        reference chromosome information.

        params -- option object; it is stored as self.params and updated in place
                  (bam lists, directory paths, chromosome order/sizes and
                  derived flags)
        iniSections -- stored unchanged as self.iniSections

        Raises Exception if no reference fasta is defined.
        """

        cleanPyEnv()

        self.params=params
        self.iniSections=iniSections

        # format bam lists:
        if self.params.normalBamList is None : self.params.normalBamList = []
        if self.params.tumorBamList is None : self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir=os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir=os.path.join(self.params.runDir,"workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir=os.path.join(self.params.runDir,"results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir=os.path.join(self.params.resultsDir,"stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir=os.path.join(self.params.resultsDir,"variants")
        ensureDir(self.params.variantsDir)
        # (disabled) self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        # (disabled) ensureDir(self.params.reportsDir)

        # Test for a missing reference before deriving the index path from it:
        # concatenating ".fai" onto None would raise TypeError and hide the
        # intended error message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        checkFile(self.params.referenceFasta,"reference fasta")
        indexRefFasta=self.params.referenceFasta+".fai"
        checkFile(indexRefFasta,"reference fasta index")

        # read fasta index
        (self.params.chromOrder,self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

        self.paths = PathInfo(self.params)

        # the high depth filter is only enabled when the input is neither exome nor RNA
        self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)
Пример #10
0
    def __init__(self, params, PathInfoType):
        """
        Create the run directory layout, build the path helper and load
        reference chromosome information.

        params -- option object; it is stored as self.params and updated in
                  place (directory paths, chromosome order/sizes,
                  chromIsSkipped and isHighDepthFilter)
        PathInfoType -- callable invoked with self.params; its result is
                        stored as self.paths

        Raises Exception if no reference fasta is defined.
        """

        cleanPyEnv()

        self.params = params

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir,
                                               "variants")
        ensureDir(self.params.variantsDir)

        # timings and other stats go into statsDir
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)

        self.paths = PathInfoType(self.params)

        # Test for a missing reference before deriving the index path from it:
        # concatenating ".fai" onto None would raise TypeError and hide the
        # intended error message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        checkFile(self.params.referenceFasta, "reference fasta")
        referenceFastaIndex = self.params.referenceFasta + ".fai"
        checkFile(referenceFastaIndex, "reference fasta index")

        # read fasta index
        (self.params.chromOrder,
         self.params.chromSizes) = getFastaChromOrderSize(referenceFastaIndex)

        # determine subset of chroms where we can skip calling entirely
        self.params.chromIsSkipped = getChromIsSkipped(self)

        # the high depth filter is only enabled when the input is neither exome nor RNA
        self.params.isHighDepthFilter = (not (self.params.isExome
                                              or self.params.isRNA))
Пример #11
0
def getOptions():
    """
    Read the command line options of the script.

    Prints the help text and exits with status 2 if positional arguments are
    present or if --bam is missing. The bam path is handed to checkFile
    before returning.

    Returns (options, args) as produced by optparse.
    """

    from optparse import OptionParser

    cmdlineParser = OptionParser(usage="usage: %prog [options] > depth.txt")

    def exitWithHelp():
        # shared failure path for every command line validation error
        cmdlineParser.print_help()
        sys.exit(2)

    cmdlineParser.add_option("--bam", dest="bamFile", type="string",
                             help="specify bam file for depth estimation (required)")

    (options, args) = cmdlineParser.parse_args()

    if args:
        exitWithHelp()

    # validate input:
    if options.bamFile is None:
        exitWithHelp()

    checkFile(options.bamFile, "input bam")

    return (options, args)
Пример #12
0
def main() :
    """
    Estimate per-chromosome depth for the bam file given on the command line.

    The average read length is approximated by summing the M/=/X cigar segments
    of mapped reads, first from a subsample ("-s 0.1") of the bam and, if that
    does not yield more than min_count reads, from the full bam. It is combined
    with the samtools idxstats output to write one line per chromosome to stdout:

        chrom <tab> depth <tab> sampled read count <tab> average read length

    Chromosomes shorter than the average read length are not reported.

    Raises Exception if min_count or fewer usable reads were observed.
    """

    import re, signal, subprocess

    (options,args) = getOptions()

    checkDir(libexecDir)

    samtoolsBin = os.path.join(libexecDir,'samtools')
    checkFile(samtoolsBin,"samtools binary")

    # chromData maps chrom name -> (idxstats column 2, idxstats column 3), which per the
    # samtools manual are the sequence length and the mapped read count;
    # chromList preserves the idxstats output order
    chromData = {}
    chromList = []

    # Commands are passed as argument lists (no shell) so that bam paths containing
    # quotes, spaces or shell metacharacters reach samtools verbatim. universal_newlines
    # makes the pipe yield text lines on both python 2 and python 3.
    idxstatsProc = subprocess.Popen([samtoolsBin,"idxstats",options.bamFile],
                                    stdout=subprocess.PIPE,universal_newlines=True)
    for line in idxstatsProc.stdout :
        word = line.strip().split('\t')
        if word[0] == "*" : continue

        chromData[word[0]] = (int(word[1]),int(word[2]))
        chromList.append(word[0])

    # release the pipe and reap the child so that it does not linger as a zombie
    idxstatsProc.stdout.close()
    idxstatsProc.wait()

    min_count = 100000

    length = 0
    count = 0
    record_count = 0

    # match any cigar with a series of match sequences:
    matchRex = re.compile("([0-9]+)[M=X]")

    # In a first pass, attempt to subsample the genome. If this turns out to be a tiny sample,
    # then go back and run without subsampling the bam
    #
    for sampleMode in ('subsample','all') :

        length = 0
        count = 0
        record_count = 0

        # use "-F 4" to filter out unmapped reads
        cmd = [samtoolsBin,"view","-F","4"]

        # use "-s 0.1" to subsample the bam records to increase sampled read diversity
        if sampleMode == 'subsample' :
            cmd += ["-s","0.1"]

        cmd.append(options.bamFile)

        viewProc = subprocess.Popen(cmd,stdout=subprocess.PIPE,universal_newlines=True)
        for line in viewProc.stdout :
            record_count += 1
            word = line.strip().split('\t',7)
            isFound = False
            for mr in matchRex.finditer(word[5]) :
                length += int(mr.group(1))
                isFound = True
            if not isFound : continue
            count += 1
            if count >= (min_count*2) : break

        # done with subprocess: without a shell in between, the signal reaches samtools
        # itself; it is only sent if samtools is still running
        if viewProc.poll() is None :
            viewProc.send_signal(signal.SIGINT)
        viewProc.stdout.close()
        viewProc.wait()

        if count > min_count : break


    if count <= min_count :
        raise Exception("Unexpected read length approximation results. Observation count: " + str(count) + " Bam record count: " + str(record_count) )


    outfp=sys.stdout

    avg_length = float(length)/float(count)

    for chrom in chromList :
        (chromLength,chromReadCount) = chromData[chrom]
        if chromLength < avg_length : continue
        depth = chromReadCount*avg_length / float(chromLength)
        outfp.write("%s\t%.3f\t%s\t%.3f\n" % (chrom, depth, count, avg_length))
Пример #13
0
def main() :
    """
    Estimate per-chromosome depth for the bam file given on the command line.

    The average read length is approximated by summing the M/=/X cigar segments
    of mapped reads, first from a subsample ("-s 0.1") of the bam and, if that
    does not yield more than min_count reads, from the full bam. It is combined
    with the samtools idxstats output to write one line per chromosome to stdout:

        chrom <tab> depth <tab> sampled read count <tab> average read length

    A low observation count is tolerated: when no reads were observed the average
    length is reported as 0, and chromosomes that are empty or shorter than the
    average read length are reported with a depth of 0.
    """

    import re, signal, subprocess

    (options,args) = getOptions()

    checkDir(libexecDir)

    samtoolsBin = os.path.join(libexecDir,'samtools')
    checkFile(samtoolsBin,"samtools binary")

    # chromData maps chrom name -> (idxstats column 2, idxstats column 3), which per the
    # samtools manual are the sequence length and the mapped read count;
    # chromList preserves the idxstats output order
    chromData = {}
    chromList = []

    # Commands are passed as argument lists (no shell) so that bam paths containing
    # quotes, spaces or shell metacharacters reach samtools verbatim. universal_newlines
    # makes the pipe yield text lines on both python 2 and python 3.
    idxstatsProc = subprocess.Popen([samtoolsBin,"idxstats",options.bamFile],
                                    stdout=subprocess.PIPE,universal_newlines=True)
    for line in idxstatsProc.stdout :
        word = line.strip().split('\t')
        if word[0] == "*" : continue

        chromData[word[0]] = (int(word[1]),int(word[2]))
        chromList.append(word[0])

    # release the pipe and reap the child so that it does not linger as a zombie
    idxstatsProc.stdout.close()
    idxstatsProc.wait()

    min_count = 100000

    length = 0
    count = 0
    record_count = 0

    # match any cigar with a series of match sequences:
    matchRex = re.compile("([0-9]+)[M=X]")

    # In a first pass, attempt to subsample the genome. If this turns out to be a tiny sample,
    # then go back and run without subsampling the bam
    #
    for sampleMode in ('subsample','all') :

        length = 0
        count = 0
        record_count = 0

        # use "-F 4" to filter out unmapped reads
        cmd = [samtoolsBin,"view","-F","4"]

        # use "-s 0.1" to subsample the bam records to increase sampled read diversity
        if sampleMode == 'subsample' :
            cmd += ["-s","0.1"]

        cmd.append(options.bamFile)

        viewProc = subprocess.Popen(cmd,stdout=subprocess.PIPE,universal_newlines=True)
        for line in viewProc.stdout :
            record_count += 1
            word = line.strip().split('\t',7)
            isFound = False
            for mr in matchRex.finditer(word[5]) :
                length += int(mr.group(1))
                isFound = True
            if not isFound : continue
            count += 1
            if count >= (min_count*2) : break

        # done with subprocess: without a shell in between, the signal reaches samtools
        # itself; it is only sent if samtools is still running
        if viewProc.poll() is None :
            viewProc.send_signal(signal.SIGINT)
        viewProc.stdout.close()
        viewProc.wait()

        if count > min_count : break


    # NOTE: a count at or below min_count is deliberately not treated as an error here


    outfp=sys.stdout

    avg_length = 0
    if count != 0 :
        avg_length = float(length)/float(count)

    for chrom in chromList :
        (chromLength,chromReadCount) = chromData[chrom]
        depth = 0
        if (chromLength >= avg_length) and (chromLength != 0) :
            depth = chromReadCount*avg_length / float(chromLength)
        outfp.write("%s\t%.3f\t%s\t%.3f\n" % (chrom, depth, count, avg_length))