Example #1
    def sendBillingInfoRecordsToGratia(self):
        """
        This is the public method for starting the dCache-transfer reporting.

        This will query records no more than _maxAge old, and always starts
        queries on hour time boundaries (i.e., 1:00:00 not 1:02:00).

        This will continue to query until we hit records starting less than 75
        minutes ago, then return.

        By default, we start with querying 60-second intervals, but will shrink
        this window if we encounter lots of data.

        If not summarizing: this method uses _execute to get all the data for
           a given interval, then uses _processResults to send them to Gratia.
           Once the query for a time interval is done, then we immediately
           checkpoint.

        If summarizing: this method continues to query until it hits the end of
           an hour interval.  At that point, it summarizes once again, and sends
           the summaries up to Gratia.  We then only checkpoint on the hour.
        """
        self._log.debug("sendBillingInfoRecordsToGratia")

        # Query no more than a set number of days in the past
        minTime = datetime.datetime.now() - datetime.timedelta(self._maxAge, 0)
        minTime = datetime.datetime(minTime.year, minTime.month, minTime.day,
            minTime.hour, 0, 0)

        # The latest allowed record is 75 minutes in the past, in order to make
        # sure we only query complete intervals
        latestAllowed = datetime.datetime.now() - datetime.timedelta(0, 75*60)

        if TestContainer.isTest():
            latestAllowed = TestContainer.getEndDateTime()

        # Start with either the last checkpoint or minTime days ago, whichever
        # is more recent.
        starttime = max(self._BIcheckpoint.lastDateStamp(), minTime)
        self._log.info("Starting queries at time %s." % starttime)

        dictRecordAgg = TimeBinRange.DictRecordAggregator(DCACHE_AGG_FIELDS,
            DCACHE_SUM_FIELDS)

        nextSummary = self._determineNextEndtime(starttime, summary=True)
        if self._summarize:
            self._log.debug("Next summary send time: %s." % nextSummary)

        results = []
        endtime = self._determineNextEndtime(starttime)
        totalRecords = 0
        # Loop until we have caught up to latestAllowed.
        while starttime < latestAllowed:
            assert starttime < endtime
            self._log.debug('sendBillingInfoRecordsToGratia: Processing ' \
                'starting at %s.' % starttime)
            # We are guaranteed that starttime will move forward to the value of
            # endtime every time we call execute.
            next_starttime, rows = self._execute(starttime, endtime, self._maxSelect)

            results += rows
            totalRecords += len(rows)
            if self._summarize:
                # Summarize the partial results
                results = Collapse.collapse(results, dictRecordAgg)
            assert next_starttime > starttime
            next_endtime = self._determineNextEndtime(next_starttime)

            # If we're not summarizing, we send up records each loop.
            if (not self._summarize) and results:
                totalRecords = 0
                # We now have all the rows we want; process them
                self._BIcheckpoint.createPending(endtime, '')
                self._processResults(results)
                self._BIcheckpoint.commit()
                if (self._range < STARTING_RANGE and len(results)*4 < \
                       self._maxSelect):
                    self._range = STARTING_RANGE
                results = []
            # If we are summarizing, send records only per hour of data
            elif (next_endtime > nextSummary) and results:
                num_agg = totalRecords - len(results)
                if num_agg:
                    factor = float(totalRecords)/float(len(results))
                    self._log.info("Aggregated %i of %i records for time " \
                        "interval ending in %s.  %.1fx reduction." % \
                        (num_agg, totalRecords, nextSummary, factor))
                else:
                    self._log.debug("Unable to aggregate any of %i records" \
                        % totalRecords)
                totalRecords = 0
                self._BIcheckpoint.createPending(nextSummary, '')
                self._processResults(results)
                self._BIcheckpoint.commit()
                results = []
                self._range = STARTING_RANGE

            nextSummary = self._determineNextEndtime(next_starttime,
                summary=True)

            endtime = next_endtime
            starttime = next_starttime

            # Check to see if the stop file has been created.  If so, break
            if os.path.exists(self._stopFileName):
                # Neha - 03/17/2011
                # No need to commit anything since we are only doing selects,
                # no inserts or updates.
                self._cur.close()
                self._connection.close()
                break
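The docstring above leans on a private helper, _determineNextEndtime, to pick query windows and hour-aligned summary checkpoints, but that helper is not part of this example. The snippet below is a minimal, hypothetical sketch of the boundary arithmetic the docstring describes (advance by the current query range, or snap to the next hour for summaries); it is not the probe's actual implementation.

import datetime

def next_endtime(start, range_seconds=60, summary=False):
    """Hypothetical stand-in for the interval/summary boundary logic described above."""
    if summary:
        # Summaries are checkpointed on the hour: 01:02:17 -> 02:00:00
        top_of_hour = start.replace(minute=0, second=0, microsecond=0)
        return top_of_hour + datetime.timedelta(hours=1)
    # Ordinary queries just advance by the current query range (60 seconds by default)
    return start + datetime.timedelta(seconds=range_seconds)

if __name__ == "__main__":
    t = datetime.datetime(2011, 3, 17, 1, 2, 17)
    print(next_endtime(t))                 # 2011-03-17 01:03:17
    print(next_endtime(t, summary=True))   # 2011-03-17 02:00:00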
Example #2
    return sum

def dumpStatistics(log):
   global recordsToSend
   if ( not TEST ):
      return
   log.info("Send to gratia:")
   dump(log,createStatistics(recordsToSend))

   log.info("Generated:")
   dump(log,createStatistics(BillingRecSimulator.sqlTableContent))
    
def dump(log,(overall,initiator,errorcode,totalRecords)):
   log.info("Overall %s" % overall)
   log.info("initiator %s"% initiator)
   log.info("errorcode %s" % errorcode)
   log.info("num records %s" % totalRecords)


if __name__ == "__main__":

  recordsToSend = BillingRecSimulator.generateTableContent() 
  print "Pre aggregation"
  print createStatistics(recordsToSend)

  recordsToSend = Collapse.collapse(recordsToSend,TimeBinRange.DictRecordAggregator(['initiator','client', 'protocol','errorcode','isnew' ],['njobs','transfersize','connectiontime']))
  print "Post Aggregation"
  print createStatistics(recordsToSend)
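For context, Collapse.collapse with a TimeBinRange.DictRecordAggregator merges records that share the aggregation fields and sums the numeric fields. The helper below is a simplified, hypothetical stand-in showing that effect on plain dicts; it is not the Gratia implementation and ignores time binning.

def collapse_records(records, agg_fields, sum_fields):
    """Merge dicts that share agg_fields, summing sum_fields (illustration only)."""
    merged = {}
    for rec in records:
        key = tuple(rec[f] for f in agg_fields)
        if key in merged:
            for f in sum_fields:
                merged[key][f] += rec[f]
        else:
            merged[key] = dict(rec)
    return list(merged.values())

sample = [
    {'initiator': 'door:A', 'errorcode': 0, 'njobs': 1, 'transfersize': 100},
    {'initiator': 'door:A', 'errorcode': 0, 'njobs': 1, 'transfersize': 250},
    {'initiator': 'door:B', 'errorcode': 0, 'njobs': 1, 'transfersize': 50},
]
print(collapse_records(sample, ['initiator', 'errorcode'],
                       ['njobs', 'transfersize']))
# door:A collapses to njobs=2, transfersize=350; door:B is unchanged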

Example #3
def dumpStatistics(log):
    global recordsToSend
    if (not TEST):
        return
    log.info("Send to gratia:")
    dump(log, createStatistics(recordsToSend))

    log.info("Generated:")
    dump(log, createStatistics(BillingRecSimulator.sqlTableContent))


def dump(log, (overall, initiator, errorcode, totalRecords)):
    log.info("Overall %s" % overall)
    log.info("initiator %s" % initiator)
    log.info("errorcode %s" % errorcode)
    log.info("num records %s" % totalRecords)


if __name__ == "__main__":

    recordsToSend = BillingRecSimulator.generateTableContent()
    print "Pre aggregation"
    print createStatistics(recordsToSend)

    recordsToSend = Collapse.collapse(
        recordsToSend,
        TimeBinRange.DictRecordAggregator(
            ['initiator', 'client', 'protocol', 'errorcode', 'isnew'],
            ['njobs', 'transfersize', 'connectiontime']))
    print "Post Aggregation"
    print createStatistics(recordsToSend)
Example #4
def runPipeline(sampleName, sampleDir, cleanup=False):
    """
    Run all scripts in the Dellingr pipeline on the specified sample
    :param sampleName: A string containing the sample name, for status message updates
    :param sampleDir: A string containing the filepath to the base sample directory
    :param cleanup: A boolean indicating if temporary files should be deleted
    :return:
    """
    def runBWA(configPath):
        """
        Aligns the reads in the specified FASTQ files using the Burrows-Wheeler aligner
        In addition, a read group is added, and the resulting BAM file is sorted

        :param configPath: A string containing a filepath to a ini file listing bwa's parameters
        :return: None
        """

        sys.stderr.write("\t".join(
            [printPrefix, time.strftime('%X'), "Running BWA...\n"]))
        # Read the arguments from the config file
        try:
            bwaConfig = ConfigObj(configPath)["bwa"]
        except KeyError:  # Thrown if the section is not labelled "bwa"
            sys.stderr.write(
                "ERROR: The config file \'%s\' does not appear to be a bwa config, as no section is labelled \'bwa\'\n"
                % (configPath))
            exit(1)
        # Parse the arguments from the config file in the required order
        try:
            bwaCommand = [
                bwaConfig["bwa"],
                "mem",
                bwaConfig["reference"],
                bwaConfig["input"][0],
                bwaConfig["input"][1],
            ]
            # We need to append the barcode sequence to the output BAM file
            if bwaConfig["fastqComment"] == "True":
                bwaCommand.insert(2, "-C")
            sortCommand = [
                bwaConfig["samtools"], "sort", "-O", "BAM", "-o",
                bwaConfig["output"]
            ]

            # To suppress BWA's status messages, we are going to buffer the stderr stream of every process into a variable
            # If BWA or a samtools task crashes (exit code != 0), we will print out everything that is buffered
            bwaStderr = []
            samtoolsStderr = []
            bwaCounter = 0

            try:
                bwaCom = subprocess.Popen(bwaCommand,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
                sortCom = subprocess.Popen(sortCommand,
                                           stdin=bwaCom.stdout,
                                           stderr=subprocess.PIPE)

                # Parse through the stderr lines of BWA and samtools, and buffer them as necessary
                for bwaLine in bwaCom.stderr:
                    # If this line indicates the progress of BWA, print it out
                    bwaLine = bwaLine.decode("utf-8")
                    if bwaLine.startswith("[M::mem_process_seqs]"):
                        bwaCounter += int(bwaLine.split(" ")[2])
                        sys.stderr.write("\t".join([
                            printPrefix,
                            time.strftime('%X'),
                            "Reads Processed:" + str(bwaCounter) + "\n"
                        ]))
                    bwaStderr.append(bwaLine)

                for samtoolsLine in sortCom.stderr:
                    samtoolsStderr.append(samtoolsLine.decode("utf-8"))

                bwaCom.stdout.close()
                bwaCom.wait()
                sortCom.wait()
                if bwaCom.returncode != 0 or sortCom.returncode != 0:  # i.e. Something crashed
                    raise subprocess.CalledProcessError(
                        bwaCom.returncode or sortCom.returncode,
                        " ".join(bwaCommand + ["|"] + sortCommand))
            except BaseException as e:  # Either a program crashed, or something is hanging and the user has force quit
                # To be safe, print out debugging info
                sys.stderr.write(
                    "ERROR: BWA and Samtools encountered an unexpected error and were terminated"
                    + os.linesep)
                sys.stderr.write("BWA Standard Error Stream:" + os.linesep)
                sys.stderr.write("".join(bwaStderr))
                sys.stderr.write("Samtools Sort Standard Error Stream:" +
                                 os.linesep)
                sys.stderr.write("".join(samtoolsStderr))
                raise e

            sys.stderr.write("\t".join(
                [printPrefix,
                 time.strftime('%X'), "Mapping Complete\n"]))

        except KeyError as e:  # i.e. A required argument is missing from the config file
            sys.stderr.write(
                "ERROR: Unable to locate a required argument in the bwa config file \'%s\'\n"
                % (configPath))
            raise e

    def sortAndRetag(inFile, outFile, bwaConfigPath):
        """
        Recalculate the MD and NM tags of the specified SAM file, and sort it

        :param inFile: A string containing a filepath to an input BAM file. Usually generated by clipOverlap
        :param outFile: A string containing an output filepath
        :param bwaConfigPath: A string containing a filepath to the BWA config file
        :return:
        """

        # Parse the reference genome location from the BWA config file
        bwaConfig = ConfigObj(bwaConfigPath)["bwa"]
        refGenome = bwaConfig["reference"]

        calmdCom = ["samtools", "calmd", inFile, refGenome,
                    "-b"]  # Recalculate MD and NM tags
        sortCom = ["samtools", "sort", "-o", outFile]

        # To cleanup the terminal, we are going to buffer the stderr stream of every process into a variable
        # If a samtools task crashes (exit code != 0), we will print out everything that is buffered
        calmdStderr = []
        sortStderr = []

        calmdTask = subprocess.Popen(calmdCom,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
        sortTask = subprocess.Popen(sortCom,
                                    stdin=calmdTask.stdout,
                                    stderr=subprocess.PIPE)

        # Parse through the stderr lines of samtools, and buffer them as necessary
        for calmdLine in calmdTask.stderr:
            calmdStderr.append(calmdLine.decode("utf-8"))

        for sortLine in sortTask.stderr:
            sortStderr.append(sortLine.decode("utf-8"))

        calmdTask.stdout.close()
        calmdTask.wait()
        sortTask.wait()

        if calmdTask.returncode != 0 or sortTask.returncode != 0:  # i.e. Something crashed
            sys.stderr.write(
                "ERROR: Samtools encountered an unexpected error and was terminated\n"
            )
            sys.stderr.write("Samtools calmd Standard Error Stream:\n")
            sys.stderr.write("\n".join(calmdStderr))
            sys.stderr.write("Samtools Sort Standard Error Stream:\n")
            sys.stderr.write("\n".join(sortStderr))
            exit(1)

        # Finally, index the BAM file
        subprocess.check_call(["samtools", "index", outFile])

    printPrefix = "DELLINGR-MAIN\t\t" + sampleName
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'),
        "Processing Sample \'%s\'\n" % sampleName.rstrip()
    ]))

    # Run Trim
    # The "Trim_Complete" marker is similar to Make's "TASK_COMPLETE" file
    trimDone = os.path.join(sampleDir, "config", "Trim_Complete")
    # i.e. Did Trim already complete for this sample? If so, do not re-run it
    if not os.path.exists(trimDone):
        # Where is Trim's config file?
        trimConfig = os.path.join(sampleDir, "config", "trim_task.ini")
        # Is there a trim config file? If not, then we don't need to run trim, as the sample doesn't have barcodes
        if os.path.exists(trimConfig):
            trimPrintPrefix = "DELLINGR-TRIM\t\t" + sampleName
            Trim.main(sysStdin=["--config", trimConfig],
                      printPrefix=trimPrintPrefix)  # Actually run Trim
            open(trimDone, "w").close(
            )  # After Trim completes, it will create this file, signifying to the end user that this task completed

    # Run bwa
    bwaDone = os.path.join(sampleDir, "config", "BWA_Complete")
    if not os.path.exists(bwaDone):
        bwaConfig = os.path.join(sampleDir, "config", "bwa_task.ini")
        runBWA(bwaConfig)
        open(bwaDone, "w").close()

    # Run Collapse
    collapseDone = os.path.join(sampleDir, "config", "Collapse_Complete")
    if not os.path.exists(collapseDone):
        collapseConfig = os.path.join(sampleDir, "config", "collapse_task.ini")
        collapsePrintPrefix = "DELLINGR-COLLAPSE\t" + sampleName
        Collapse.main(sysStdin=["--config", collapseConfig],
                      printPrefix=collapsePrintPrefix)

        # Sort the collapsed BAM output
        # Parse the config file for the output file name
        collapseConfArgs = ConfigObj(collapseConfig)
        sortInput = collapseConfArgs["collapse"]["output"]
        # Append "sort" as the output file name
        sortOutput = sortInput.replace(".bam", ".sort.bam")
        tmpDir = os.sep + "tmp" + os.sep
        resultsDir = os.sep + "results" + os.sep
        sortOutput = sortOutput.replace(tmpDir, resultsDir)
        sys.stderr.write("\t".join([
            printPrefix,
            time.strftime('%X'),
            "Sorting final BAM file, and recalculating tags...\n"
        ]))
        bwaConfig = os.path.join(sampleDir, "config", "bwa_task.ini")
        sortAndRetag(sortInput, sortOutput, bwaConfig)

        open(collapseDone, "w").close()

    # Run call (variant calling)
    callDone = os.path.join(sampleDir, "config", "Call_Complete")
    pipelineDone = os.path.join(sampleDir, "config", "Pipeline_Complete")
    if not os.path.exists(callDone):
        callConfig = os.path.join(sampleDir, "config", "call_task.ini")
        callPrintPrefix = "DELLINGR-CALL\t\t" + sampleName
        Call.main(sysStdin=["--config", callConfig],
                  printPrefix=callPrintPrefix)
        open(callDone, "w").close()

        # Mark this sample as fully processed
        open(pipelineDone, "w").close()

    # Cleanup intermediate files (if specified)
    if cleanup:
        tmpDir = os.path.join(sampleDir, "tmp")
        tmpFiles = os.listdir(tmpDir)
        for tmpFile in tmpFiles:
            os.remove(os.path.join(tmpDir, tmpFile))

    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'),
        "%s: Pipeline Complete\n" % sampleName.rstrip()
    ]))
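The runBWA and sortAndRetag helpers above both follow the same pattern: pipe one subprocess into another, buffer each stderr stream, and only replay the buffers if a return code is non-zero. Below is a hedged, stand-alone sketch of that pattern; the commands used (gzip and wc) are placeholders and are not part of the Dellingr pipeline.

import subprocess
import sys

def run_piped(cmd1, cmd2):
    """Run cmd1 | cmd2, keeping stderr quiet unless something fails."""
    p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stderr=subprocess.PIPE)
    p1.stdout.close()  # Let p1 receive SIGPIPE if p2 exits early
    err1 = p1.stderr.read().decode("utf-8")
    err2 = p2.stderr.read().decode("utf-8")
    p1.wait()
    p2.wait()
    if p1.returncode != 0 or p2.returncode != 0:
        # Something crashed: replay the buffered stderr streams for debugging
        sys.stderr.write(err1)
        sys.stderr.write(err2)
        raise RuntimeError("Piped command failed: %s | %s" % (cmd1, cmd2))

if __name__ == "__main__":
    run_piped(["gzip", "-c", "/etc/hosts"], ["wc", "-c"])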