def mergeVCFFiles(tempFileNames, finalFileName, log):
    """
    Merge the temporary VCF files into a single final VCF, using a priority
    queue (heap) of open files to interleave their records.

    tempFileNames -- paths of the temporary VCF files to merge. Header lines
                     (those starting with "#") are copied from the first
                     file only.
    finalFileName -- path of the merged output, or "-" to write to stdout.
    log           -- logger-like object; only its info() method is used.

    Temporary files that contain no calls are closed and deleted here. Files
    that do contain calls are handed to FileForQueueing, which is then
    responsible for them (presumably it closes/removes the file when it is
    exhausted -- confirm against FileForQueueing).

    The output file is always closed, even if the merge fails part-way.
    Nothing is returned.
    """
    log.info("Merging output VCF file(s) into final file %s" %(finalFileName))

    # Final output file. sys.stdout is borrowed, so we must never close it.
    writingToStdout = (finalFileName == "-")

    if writingToStdout:
        outputVCF = sys.stdout
    else:
        outputVCF = open(finalFileName, 'wb')

    try:
        theHeap = []

        # Initialise queue: one entry per temp file that has at least one call
        for index, fileName in enumerate(tempFileNames):
            theFile = open(fileName, 'rb')

            for line in theFile:
                if line[0] == "#":
                    # Header line: only the first file's header is kept
                    if index == 0:
                        outputVCF.write(line)
                else:
                    # First call in this file; queue the file on that line
                    heapq.heappush(theHeap, FileForQueueing(theFile, line))
                    break

            # If there are no calls in the temp file, we still want to
            # remove it.
            else:
                theFile.close()
                os.remove(fileName)

        # NOTE: the heap holds one entry per temp file with calls, so this
        # number is a count of files, not of individual variants.
        log.info("%d variants found" %(len(theHeap)))

        # Merge-sort the output using a priority queue
        while theHeap:

            # Get file from heap in right order
            nextFile = heapq.heappop(theHeap)
            outputVCF.write(nextFile.line)

            # Advance the file; it only goes back on the heap if it still
            # has a line to offer.
            try:
                nextFile.next()
            except StopIteration:
                continue
            else:
                heapq.heappush(theHeap, nextFile)
    finally:
        # Close final output file, whether or not the merge succeeded
        if not writingToStdout:
            outputVCF.close()

    log.info("Finished merging VCF file(s)")
def mergeVCFFiles(tempFileNames, finalFileName, log):
    """
    Merge the temporary VCF files into a single final VCF, using a priority
    queue (heap) of open files to interleave their records.

    tempFileNames -- paths of the temporary VCF files to merge. Header lines
                     (those starting with "#") are copied from the first
                     file only.
    finalFileName -- path of the merged output file.
    log           -- logger-like object; only its info() method is used.

    Temporary files that contain no calls are closed and deleted here. Files
    that do contain calls are handed to FileForQueueing, which is then
    responsible for them (presumably it closes/removes the file when it is
    exhausted -- confirm against FileForQueueing).

    The output file is always closed, even if the merge fails part-way.
    Nothing is returned.
    """
    log.info("Merging output VCF file(s) into final file %s" % (finalFileName))

    # Final output file
    outputVCF = open(finalFileName, 'wb')

    try:
        theHeap = []

        # Initialise queue: one entry per temp file that has at least one call
        for index, fileName in enumerate(tempFileNames):
            theFile = open(fileName, 'rb')

            for line in theFile:
                if line[0] == "#":
                    # Header line: only the first file's header is kept
                    if index == 0:
                        outputVCF.write(line)
                else:
                    # First call in this file; queue the file on that line
                    heapq.heappush(theHeap, FileForQueueing(theFile, line))
                    break

            # If there are no calls in the temp file, we still want to
            # remove it.
            else:
                theFile.close()
                os.remove(fileName)

        # Merge-sort the output using a priority queue
        while theHeap:

            # Get file from heap in right order
            nextFile = heapq.heappop(theHeap)
            outputVCF.write(nextFile.line)

            # Advance the file; it only goes back on the heap if it still
            # has a line to offer.
            try:
                nextFile.next()
            except StopIteration:
                continue
            else:
                heapq.heappush(theHeap, nextFile)
    finally:
        # Close final output file, whether or not the merge succeeded
        outputVCF.close()

    log.info("Finished merging VCF file(s)")
def continueCalling(args):
    """
    This function allows the user to re-start Platypus from the partially
    completed output of a previous job. This takes a single argument: the VCF
    file of a previous incomplete job. Platypus then picks up all the options
    for the previous job from the VCF header, and restarts calling from the
    latest sensible position (the last integer multiple of --bufferSize on the
    last chromosome in the VCF).

    args -- command-line style argument list; only --vcfFile is read from it.

    The calls that are considered complete are copied into a new file, named
    after the input with "_ContinuedFromFailedProcess.vcf" in place of ".vcf",
    and the variant caller is then run on the remaining regions with that file
    as its output.

    Returns None. Problems with the input VCF (no platypus options in the
    header, a multi-process job, no variant records) are logged as errors and
    the function returns without calling. Raises StandardError if the
    unfinished region cannot be located among the regions of the old job.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed("Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be.")

    parser = extendedoptparse.OptionParser()
    parser.add_option("--vcfFile", dest="vcfFile", help="Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus", action='store', type='string')
    (options, args) = parser.parse_args(args)

    continuedSuffix = "_ContinuedFromFailedProcess.vcf"
    newOutputFileName = options.vcfFile.replace(".vcf", continuedSuffix)

    # If the input name has no ".vcf" in it, replace() leaves it unchanged and
    # we would truncate the input file while still reading it. Use a distinct
    # name instead.
    if newOutputFileName == options.vcfFile:
        newOutputFileName = options.vcfFile + continuedSuffix

    logger.info("Platypus will now attempt to finish running a failed process, from the VCF output in file %s" %(options.vcfFile))
    logger.info("Complete output (old + new) will go to file %s" %(newOutputFileName))

    theVCF = open(options.vcfFile, 'r')

    try:
        # First pass: recover the old options and find the last line written
        lastLine = None
        platypusOptions = None

        for line in theVCF:
            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            lastLine = line

        if platypusOptions is None:
            logger.error("Could not parse old platypus options from VCF %s" %(options.vcfFile))
            logger.error("Check that VCF file is a valid platypus output file")
            logger.error("Quitting now.")
            return

        if platypusOptions.nCPU != 1:
            logger.error("Platypus can only currently continue from single process jobs")
            logger.error("The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1).")
            logger.error("Quitting now.")
            return

        # A file that ends on a header line has no calls to restart from
        if lastLine.startswith("#"):
            logger.error("VCF %s contains no variant calls, so there is nothing to continue from" %(options.vcfFile))
            logger.error("Quitting now.")
            return

        cols = lastLine.strip().split("\t")
        lastChrom = cols[0]
        realLastPos = int(cols[1]) - 1

        # Round down to the last whole multiple of the buffer size
        lastPos = (realLastPos//platypusOptions.bufferSize)*platypusOptions.bufferSize

        logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s" %(lastChrom,realLastPos,lastChrom,lastPos))
        allRegions = sorted(platypusutils.getRegions(platypusOptions), cmp=regionSort)

        # Find the first region that still needs calling: the one after the
        # region on lastChrom whose third field equals lastPos (presumably
        # the region end -- confirm against platypusutils.getRegions).
        theIndex = -1

        for index,region in enumerate(allRegions):
            if region[0] == lastChrom and region[2] == lastPos:
                theIndex = index + 1

        if theIndex == -1:
            raise StandardError("Could not find region which was unfinished in input VCF")

        logger.info("Platypus will continue calling. Output will go to file %s." %(newOutputFileName))

        doneRegions = allRegions[:theIndex]
        doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

        # Reset input VCF file
        theVCF.seek(0,0)

        # Make new file to store complete output
        outputVCF = open(newOutputFileName, "w")

        try:
            # Copy old, unfinished VCF into new VCF, stopping at the first
            # record that falls in the part which will be re-called.
            for line in theVCF:
                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                if chrom in doneChroms:
                    outputVCF.write(line)
                elif chrom == lastChrom and pos < lastPos:
                    outputVCF.write(line)
                else:
                    break
        finally:
            outputVCF.close()
    finally:
        theVCF.close()

    setattr(platypusOptions, "unfinishedRegions", allRegions[theIndex:])
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)
def continueCalling(args):
    """
    This function allows the user to re-start Platypus from the partially
    completed output of a previous job. This takes a single argument: the VCF
    file of a previous incomplete job. Platypus then picks up all the options
    for the previous job from the VCF header, and restarts calling from the
    latest sensible position (the last integer multiple of --bufferSize on the
    last chromosome in the VCF).

    args -- command-line style argument list; only --vcfFile is read from it.

    The calls that are considered complete are copied into a new file, named
    after the input with "_ContinuedFromFailedProcess.vcf" in place of ".vcf",
    and the variant caller is then run on the remaining regions with that file
    as its output.

    Returns None. Problems with the input VCF (no platypus options in the
    header, a multi-process job, no variant records) are logged as errors and
    the function returns without calling. Raises StandardError if the
    unfinished region cannot be located among the regions of the old job.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed(
        "Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be."
    )

    parser = extendedoptparse.OptionParser()
    parser.add_option(
        "--vcfFile",
        dest="vcfFile",
        help=
        "Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus",
        action='store',
        type='string')
    (options, args) = parser.parse_args(args)

    continuedSuffix = "_ContinuedFromFailedProcess.vcf"
    newOutputFileName = options.vcfFile.replace(".vcf", continuedSuffix)

    # If the input name has no ".vcf" in it, replace() leaves it unchanged and
    # we would truncate the input file while still reading it. Use a distinct
    # name instead.
    if newOutputFileName == options.vcfFile:
        newOutputFileName = options.vcfFile + continuedSuffix

    logger.info(
        "Platypus will now attempt to finish running a failed process, from the VCF output in file %s"
        % (options.vcfFile))
    logger.info("Complete output (old + new) will go to file %s" %
                (newOutputFileName))

    theVCF = open(options.vcfFile, 'r')

    try:
        # First pass: recover the old options and find the last line written
        lastLine = None
        platypusOptions = None

        for line in theVCF:
            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            lastLine = line

        if platypusOptions is None:
            logger.error("Could not parse old platypus options from VCF %s" %
                         (options.vcfFile))
            logger.error("Check that VCF file is a valid platypus output file")
            logger.error("Quitting now.")
            return

        if platypusOptions.nCPU != 1:
            logger.error(
                "Platypus can only currently continue from single process jobs")
            logger.error(
                "The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1)."
            )
            logger.error("Quitting now.")
            return

        # A file that ends on a header line has no calls to restart from
        if lastLine.startswith("#"):
            logger.error(
                "VCF %s contains no variant calls, so there is nothing to continue from"
                % (options.vcfFile))
            logger.error("Quitting now.")
            return

        cols = lastLine.strip().split("\t")
        lastChrom = cols[0]
        realLastPos = int(cols[1]) - 1

        # Round down to the last whole multiple of the buffer size
        lastPos = (realLastPos //
                   platypusOptions.bufferSize) * platypusOptions.bufferSize

        logger.info(
            "Previous job failed at %s:%s. Job will be re-run from %s:%s" %
            (lastChrom, realLastPos, lastChrom, lastPos))
        allRegions = sorted(platypusutils.getRegions(platypusOptions),
                            cmp=regionSort)

        # Find the first region that still needs calling: the one after the
        # region on lastChrom whose third field equals lastPos (presumably
        # the region end -- confirm against platypusutils.getRegions).
        theIndex = -1

        for index, region in enumerate(allRegions):
            if region[0] == lastChrom and region[2] == lastPos:
                theIndex = index + 1

        if theIndex == -1:
            raise StandardError(
                "Could not find region which was unfinished in input VCF")

        logger.info(
            "Platypus will continue calling. Output will go to file %s." %
            (newOutputFileName))

        doneRegions = allRegions[:theIndex]
        doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

        # Reset input VCF file
        theVCF.seek(0, 0)

        # Make new file to store complete output
        outputVCF = open(newOutputFileName, "w")

        try:
            # Copy old, unfinished VCF into new VCF, stopping at the first
            # record that falls in the part which will be re-called.
            for line in theVCF:
                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                if chrom in doneChroms:
                    outputVCF.write(line)
                elif chrom == lastChrom and pos < lastPos:
                    outputVCF.write(line)
                else:
                    break
        finally:
            outputVCF.close()
    finally:
        theVCF.close()

    setattr(platypusOptions, "unfinishedRegions", allRegions[theIndex:])
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)