Пример #1
0
def mergeVCFFiles(tempFileNames, finalFileName, log):
    """
    Merge the temporary VCF files into a single output VCF.

    The header lines (those starting with "#") of the first temporary file
    are copied to the output; the header lines of every other temporary file
    are skipped. The remaining records are merged through a heap of
    FileForQueueing objects, so the order of the output is whatever ordering
    FileForQueueing defines.

    tempFileNames -- names of the temporary VCF files to merge. A file that
                     contains no records is closed and deleted here.
    finalFileName -- name of the output file, or "-" for standard output.
    log           -- logger used for progress messages.
    """
    log.info("Merging output VCF file(s) into final file %s" %(finalFileName))

    # Final output file. The temporary files are read in binary mode, so
    # standard output is written through its binary buffer where it has one.
    writeToStdout = (finalFileName == "-")

    if writeToStdout:
        outputVCF = getattr(sys.stdout, "buffer", sys.stdout)
    else:
        outputVCF = open(finalFileName, 'wb')

    theHeap = []
    nVariants = 0

    try:
        # Initialise queue
        for index, fileName in enumerate(tempFileNames):
            theFile = open(fileName, 'rb')

            for line in theFile:

                # Header line: only the header of the first file is kept
                if line.startswith(b"#"):
                    if index == 0:
                        outputVCF.write(line)
                else:
                    # First record of this file: queue the file and move on
                    heapq.heappush(theHeap, FileForQueueing(theFile, line))
                    break
            # If there are no calls in the temp file, we still want to
            # remove it.
            else:
                theFile.close()
                os.remove(fileName)

        log.info("%d temporary file(s) contain variant calls" %(len(theHeap)))

        # Merge-sort the output using a priority queue
        while theHeap:

            # Get file from heap in right order
            nextFile = heapq.heappop(theHeap)
            outputVCF.write(nextFile.line)
            nVariants += 1

            # Put file back on heap, unless it has no more records
            try:
                nextFile.next()
            except StopIteration:
                continue
            else:
                heapq.heappush(theHeap, nextFile)

    finally:
        # Close final output file, even if merging failed part-way through
        if not writeToStdout:
            outputVCF.close()

    log.info("%d variants found" %(nVariants))
    log.info("Finished merging VCF file(s)")
Пример #2
0
def mergeVCFFiles(tempFileNames, finalFileName, log):
    """
    Merge the temporary VCF files into the final output VCF.

    The header lines (those starting with "#") of the first temporary file
    are copied to the output; the header lines of every other temporary file
    are skipped. The remaining records are merged through a heap of
    FileForQueueing objects, so the order of the output is whatever ordering
    FileForQueueing defines.

    tempFileNames -- names of the temporary VCF files to merge. A file that
                     contains no records is closed and deleted here.
    finalFileName -- name of the output file, which is created or overwritten.
    log           -- logger used for progress messages.
    """
    log.info("Merging output VCF file(s) into final file %s" % (finalFileName))

    theHeap = []

    # Final output file. The context manager closes it even if merging fails.
    with open(finalFileName, 'wb') as outputVCF:

        # Initialise queue
        for index, fileName in enumerate(tempFileNames):
            theFile = open(fileName, 'rb')

            for line in theFile:

                # Header line: only the header of the first file is kept.
                # The files are read in binary mode, hence the bytes prefix.
                if line.startswith(b"#"):
                    if index == 0:
                        outputVCF.write(line)
                else:
                    # First record of this file: queue the file and move on
                    heapq.heappush(theHeap, FileForQueueing(theFile, line))
                    break
            # If there are no calls in the temp file, we still want to
            # remove it.
            else:
                theFile.close()
                os.remove(fileName)

        # Merge-sort the output using a priority queue
        while theHeap:

            # Get file from heap in right order
            nextFile = heapq.heappop(theHeap)
            outputVCF.write(nextFile.line)

            # Put file back on heap, unless it has no more records
            try:
                nextFile.next()
            except StopIteration:
                continue
            else:
                heapq.heappush(theHeap, nextFile)

    log.info("Finished merging VCF file(s)")
Пример #3
0
def continueCalling(args):
    """
    Re-start Platypus from the partially completed output of a previous job.

    This takes a single option, --vcfFile: the VCF file of a previous incomplete job. Platypus
    then picks up all the options for the previous job from the VCF header, and restarts calling from the latest
    sensible position (the last integer multiple of --bufferSize on the last chromosome in the VCF).

    The records of the old VCF that precede the restart position are copied into a new file (the old
    file name with ".vcf" replaced by "_ContinuedFromFailedProcess.vcf"), and the variant caller is then
    run on the remaining regions with that new file as its output.

    Nothing is returned. Problems with the input are logged and the function returns without running the
    caller. StandardError is raised if the region ending at the restart position cannot be found.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)


    # Seed the Python random number generator
    random.seed("Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be.")
    parser = extendedoptparse.OptionParser()
    parser.add_option("--vcfFile", dest="vcfFile", help="Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus", action='store', type='string')
    (options, args) = parser.parse_args(args)

    if options.vcfFile is None:
        logger.error("No input VCF was given. Use --vcfFile to specify the VCF of the unfinished job")
        logger.error("Quitting now.")
        return

    newOutputFileName = options.vcfFile.replace(".vcf", "_ContinuedFromFailedProcess.vcf")

    # Without ".vcf" in the name the new file would be the input file itself, and
    # opening it for writing would wipe the old output before it has been copied.
    if newOutputFileName == options.vcfFile:
        logger.error("Input file name %s does not contain '.vcf', so the continued output would overwrite the input" %(options.vcfFile))
        logger.error("Quitting now.")
        return

    logger.info("Platypus will now attempt to finish running a failed process, from the VCF output in file %s" %(options.vcfFile))
    logger.info("Complete output (old + new) will go to file %s" %(newOutputFileName))

    lastLine = None
    platypusOptions = None

    # First pass: pick up the old options from the header, and find the last line
    with open(options.vcfFile, 'r') as theVCF:
        for line in theVCF:

            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            lastLine = line

    if platypusOptions is None:
        logger.error("Could not parse old platypus options from VCF %s" %(options.vcfFile))
        logger.error("Check that VCF file is a valid platypus output file")
        logger.error("Quitting now.")
        return

    # A VCF that ends with a header line holds no records to continue from
    if lastLine.startswith("#"):
        logger.error("VCF %s contains no variant records, so there is no position to continue from" %(options.vcfFile))
        logger.error("Quitting now.")
        return

    cols = lastLine.strip().split("\t")

    lastChrom = cols[0]
    realLastPos = int(cols[1]) - 1

    # Round down to the nearest multiple of the buffer size
    lastPos = (realLastPos//platypusOptions.bufferSize)*platypusOptions.bufferSize

    if platypusOptions.nCPU != 1:
        logger.error("Platypus can only currently continue from single process jobs")
        logger.error("The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1).")
        logger.error("Quitting now.")
        return

    logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s" %(lastChrom,realLastPos,lastChrom,lastPos))
    allRegions = sorted(platypusutils.getRegions(platypusOptions), cmp=regionSort)
    theIndex = -1

    # Find the region that ends at the restart position; calling resumes with the one after it
    for index,region in enumerate(allRegions):
        if region[0] == lastChrom and region[2] == lastPos:
            theIndex = index + 1

    if theIndex == -1:
        raise StandardError("Could not find region which was unfinished in input VCF")

    logger.info("Platypus will continue calling. Output will go to file %s." %(newOutputFileName))

    doneRegions = allRegions[:theIndex]
    doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

    # Second pass: copy the old, unfinished VCF into the new VCF
    with open(options.vcfFile, 'r') as theVCF:
        with open(newOutputFileName, "w") as outputVCF:
            for line in theVCF:

                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                # Keep records from finished chromosomes, and those before the restart position
                if chrom in doneChroms or (chrom == lastChrom and pos < lastPos):
                    outputVCF.write(line)
                else:
                    break

    platypusOptions.unfinishedRegions = allRegions[theIndex:]
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)
Пример #4
0
def continueCalling(args):
    """
    Re-start Platypus from the partially completed output of a previous job.

    This takes a single option, --vcfFile: the VCF file of a previous incomplete job. Platypus
    then picks up all the options for the previous job from the VCF header, and restarts calling from the latest
    sensible position (the last integer multiple of --bufferSize on the last chromosome in the VCF).

    The records of the old VCF that precede the restart position are copied into a new file (the old
    file name with ".vcf" replaced by "_ContinuedFromFailedProcess.vcf"), and the variant caller is then
    run on the remaining regions with that new file as its output.

    Nothing is returned. Problems with the input are logged and the function returns without running the
    caller. StandardError is raised if the region ending at the restart position cannot be found.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed(
        "Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be."
    )
    parser = extendedoptparse.OptionParser()
    parser.add_option(
        "--vcfFile",
        dest="vcfFile",
        help=
        "Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus",
        action='store',
        type='string')
    (options, args) = parser.parse_args(args)

    if options.vcfFile is None:
        logger.error(
            "No input VCF was given. Use --vcfFile to specify the VCF of the unfinished job"
        )
        logger.error("Quitting now.")
        return

    newOutputFileName = options.vcfFile.replace(
        ".vcf", "_ContinuedFromFailedProcess.vcf")

    # Without ".vcf" in the name the new file would be the input file itself,
    # and opening it for writing would wipe the old output before it is copied.
    if newOutputFileName == options.vcfFile:
        logger.error(
            "Input file name %s does not contain '.vcf', so the continued output would overwrite the input"
            % (options.vcfFile))
        logger.error("Quitting now.")
        return

    logger.info(
        "Platypus will now attempt to finish running a failed process, from the VCF output in file %s"
        % (options.vcfFile))
    logger.info("Complete output (old + new) will go to file %s" %
                (newOutputFileName))

    lastLine = None
    platypusOptions = None

    # First pass: pick up the old options from the header, and find the last line
    with open(options.vcfFile, 'r') as theVCF:
        for line in theVCF:

            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            lastLine = line

    if platypusOptions is None:
        logger.error("Could not parse old platypus options from VCF %s" %
                     (options.vcfFile))
        logger.error("Check that VCF file is a valid platypus output file")
        logger.error("Quitting now.")
        return

    # A VCF that ends with a header line holds no records to continue from
    if lastLine.startswith("#"):
        logger.error(
            "VCF %s contains no variant records, so there is no position to continue from"
            % (options.vcfFile))
        logger.error("Quitting now.")
        return

    cols = lastLine.strip().split("\t")

    lastChrom = cols[0]
    realLastPos = int(cols[1]) - 1

    # Round down to the nearest multiple of the buffer size
    lastPos = (realLastPos //
               platypusOptions.bufferSize) * platypusOptions.bufferSize

    if platypusOptions.nCPU != 1:
        logger.error(
            "Platypus can only currently continue from single process jobs")
        logger.error(
            "The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1)."
        )
        logger.error("Quitting now.")
        return

    logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s" %
                (lastChrom, realLastPos, lastChrom, lastPos))
    allRegions = sorted(platypusutils.getRegions(platypusOptions),
                        cmp=regionSort)
    theIndex = -1

    # Find the region that ends at the restart position; calling resumes with
    # the one after it
    for index, region in enumerate(allRegions):
        if region[0] == lastChrom and region[2] == lastPos:
            theIndex = index + 1

    if theIndex == -1:
        raise StandardError(
            "Could not find region which was unfinished in input VCF")

    logger.info("Platypus will continue calling. Output will go to file %s." %
                (newOutputFileName))

    doneRegions = allRegions[:theIndex]
    doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

    # Second pass: copy the old, unfinished VCF into the new VCF
    with open(options.vcfFile, 'r') as theVCF:
        with open(newOutputFileName, "w") as outputVCF:
            for line in theVCF:

                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                # Keep records from finished chromosomes, and those before
                # the restart position
                if chrom in doneChroms or (chrom == lastChrom
                                           and pos < lastPos):
                    outputVCF.write(line)
                else:
                    break

    platypusOptions.unfinishedRegions = allRegions[theIndex:]
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)