Example No. 1
    def queryItems(self, options, outputFile):
        """Query for all clinical item records that fulfill the options criteria
        and then send the results as tab-delimited output to the outputFile.
        """
        pauseSeconds = float(options.pauseSeconds)

        query = SQLQuery()
        query.addSelect("count(order_med_id_coded) as nOrders")
        query.addSelect("om.med_route, om.medication_id, om.med_description")
        query.addFrom("starr_datalake2018.order_med as om")
        if options.descriptionPrefix:
            query.addWhereOp("om.med_description", "like",
                             options.descriptionPrefix + "%%")
            # Add wildcard to enable prefix search
        if options.medRoutes:
            query.addWhereIn("om.med_route", options.medRoutes.split(","))
        query.addGroupBy("om.medication_id, om.med_description, om.med_route")
        query.addOrderBy("nOrders desc, om.med_description")

        formatter = TextResultsFormatter(outputFile)

        prog = ProgressDots()
        for row in DBUtil.execute(query,
                                  includeColumnNames=True,
                                  connFactory=self.connFactory):
            formatter.formatTuple(row)
            time.sleep(pauseSeconds)
            prog.update()
        prog.printStatus()
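
Every example in this collection drives a ProgressDots helper from the medinfo utilities that is not reproduced here. Purely for orientation, a minimal stand-in with the surface these call sites rely on (update/getCounts/printStatus, plus the older capitalized spellings some examples use) might look like the sketch below; the constructor arguments mirror the call sites, everything else is an assumption.

import sys
import time

class ProgressDots:
    """Minimal stand-in for the medinfo ProgressDots helper (assumed interface only)."""
    def __init__(self, big=100, small=1, label="items", total=None):
        self.big = big          # Print a progress marker every `big` updates
        self.small = small
        self.label = label
        self.total = total      # Optional expected total, for context in the status line
        self.count = 0
        self.startTime = time.time()

    def update(self):
        self.count += 1
        if self.count % self.big == 0:
            sys.stderr.write(".")
            sys.stderr.flush()

    def getCounts(self):
        return self.count

    def printStatus(self):
        elapsed = time.time() - self.startTime
        totalStr = "/%s" % self.total if self.total is not None else ""
        sys.stderr.write("\n%d%s %s processed in %.1f seconds\n" %
                         (self.count, totalStr, self.label, elapsed))

    # Some call sites below use the older capitalized spellings
    Update = update
    GetCounts = getCounts
    PrintStatus = printStatus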
Example No. 2
def main(nResamples = 1000, sampleSize = 200, positiveCount = 180):
	"""Build a binary sample of sampleSize items containing positiveCount ones,
	then bootstrap resample it nResamples times to estimate a percentile
	confidence interval around the sample rate statistic (calcStat).
	"""
	confidenceInterval = 0.95

	sample = list()
	for i in range(sampleSize):
		value = 0;
		if i < positiveCount:
			value = 1;
		sample.append(value);
	sample = np.array(sample);

	print("Sample Rate: ",calcStat(sample) );

	rng = np.random.RandomState(nResamples);	# Seed with number for consistency
	bootstrapValues = list();

	prog = ProgressDots();
	for i in range(nResamples):
		indices = rng.random_integers(0, sampleSize-1, sampleSize);	# Get indices of resample items (with replacement)
		resample = sample[indices];
		bootstrapValues.append( calcStat(resample) );
		prog.update();

	ciFractionLow = (1.0-confidenceInterval) / 2;
	ciFractionHigh= confidenceInterval + ciFractionLow;

	bootstrapValues.sort();
	print("Sample Rate 95%CI Low", bootstrapValues[int(ciFractionLow*nResamples)] )
	print("Sample Rate 95%CI High", bootstrapValues[int(ciFractionHigh*nResamples)] )
Example No. 3
    def convertSourceItems(self, convOptions, conn=None):
        """Primary run function to process the contents of the raw source
        table and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies after the fact to catch repeated conversions.

        startDate - If provided, only return items whose ordering_date is on or after that date.
        endDate - If provided, only return items whose ordering_date is before that date.
        """
        log.info("Conversion for items dated %s to %s" %
                 (convOptions.startDate, convOptions.endDate))
        progress = ProgressDots()

        extConn = conn is not None
        if not extConn:
            conn = self.connFactory.connection()

        try:
            # Next round for medications directly from order_med table not addressed in medmix  TODO (nodir) seems like an unrelated comment?
            category = self.categoryFromSourceItem(conn)
            for sourceItem in self.querySourceItems(convOptions):
                log.debug('sourceItem: {}'.format(sourceItem))
                self.convertSourceItem(category, sourceItem, conn=conn)
                progress.Update()

        finally:
            conn.close()

        progress.PrintStatus()
Example No. 4
def parseLabResultsFile(labFile):
    log.info("Parse lab results file");
    prog = ProgressDots();
    labsByBaseNameByPatientId = dict(); # Dictionary of dictionaries of lists of result items
    for labResult in TabDictReader(labFile):
        if labResult["ord_num_value"] is not None and labResult["ord_num_value"] != NULL_STRING:
            patientId = int(labResult["pat_id"]);
            labBaseName = labResult["base_name"];
            resultValue = float(labResult["ord_num_value"]);
            resultTime = DBUtil.parseDateValue(labResult["result_time"]);

            if resultValue < LAB_SENTINEL_VALUE:    # Skip apparent placeholder values
                labResult["pat_id"] = labResult["patient_id"] = patientId;
                labResult["ord_num_value"] = resultValue;
                labResult["result_time"] = resultTime;

                if patientId not in labsByBaseNameByPatientId:
                    labsByBaseNameByPatientId[patientId] = dict();
                if labBaseName not in labsByBaseNameByPatientId[patientId]:
                    labsByBaseNameByPatientId[patientId][labBaseName] = list();
                labsByBaseNameByPatientId[patientId][labBaseName].append( labResult );

        prog.update();
    prog.printStatus();
    return labsByBaseNameByPatientId;
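
The return value is a dictionary keyed by patient ID, then by lab base name, holding the parsed result rows. A hypothetical access pattern (the file name is made up for illustration):

with open("lab_results.tsv") as labFile:
    labsByBaseNameByPatientId = parseLabResultsFile(labFile)

for patientId, labsByBaseName in labsByBaseNameByPatientId.items():
    for baseName, results in labsByBaseName.items():
        values = [result["ord_num_value"] for result in results]
        print(patientId, baseName, len(values), min(values), max(values))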
Example No. 5
    def queryItems(self, options, outputFile):
        """Query for all clinical item records that fulfill the options criteria
        and then send the results as tab-delimited output to the outputFile.
        """
        pauseSeconds = float(options.pauseSeconds)

        query = SQLQuery()
        query.addSelect(
            "cic.description, ci.clinical_item_id, ci.name, ci.description")
        query.addFrom("clinical_item_category as cic")
        query.addFrom("clinical_item as ci")
        query.addWhere(
            "cic.clinical_item_category_id = ci.clinical_item_category_id")
        if options.itemPrefix:
            query.addWhereOp("ci.description", "like",
                             options.itemPrefix + "%%")
            # Add wildcard to enable prefix search
        if options.categoryNames:
            query.addWhereIn("cic.description",
                             options.categoryNames.split(","))
        query.addOrderBy(
            "cic.description, ci.name, ci.description, ci.clinical_item_id")

        formatter = TextResultsFormatter(outputFile)

        prog = ProgressDots()
        for row in DBUtil.execute(query,
                                  includeColumnNames=True,
                                  connFactory=self.connFactory):
            formatter.formatTuple(row)
            time.sleep(pauseSeconds)
            prog.update()
        prog.printStatus()
Example No. 6
    def convertSourceItems(self, convOptions):
        """Primary run function to process the contents of the order_med
        table and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies after the fact to catch repeated conversions.

        startDate - If provided, only return items whose order_time_jittered is on or after that date.
        endDate - If provided, only return items whose order_time_jittered is before that date.
        """
        log.info("Conversion for items dated {} to {}".format(
            convOptions.startDate, convOptions.endDate))
        progress = ProgressDots()
        conn = self.connFactory.connection()
        try:
            # Load up the medication mapping table to facilitate subsequent conversions
            rxcuiDataByMedId = self.loadRXCUIData()

            # Next round for medications directly from order_med table not addressed in medmix
            for sourceItem in self.querySourceItems(rxcuiDataByMedId,
                                                    convOptions,
                                                    progress=progress,
                                                    conn=conn):
                self.convertSourceItem(sourceItem, conn=conn)
                progress.Update()

        finally:
            conn.close()
        progress.PrintStatus()
Example No. 7
    def analyzePatientItems(self, analysisOptions):
        """Primary run function to analyze patient clinical item data and
        record updated stats to the respective database tables.

        Does the analysis only for records pertaining to the given patient IDs
        (provides a way to limit the extent of analysis depending on params).

        Will also record analyze_date timestamp on any records analyzed,
        so that analysis will not be repeated if called again on the same records.
        """
        progress = ProgressDots();
        conn = self.connFactory.connection();

        try:
            # Preload lookup data to facilitate rapid checks and filters later
            linkedItemIdsByBaseId = self.dataManager.loadLinkedItemIdsByBaseId(conn=conn);

            # Keep an in memory buffer of the updates to be done so can stall and submit them
            #   to the database in batch to minimize inefficient DB hits
            updateBuffer = self.makeUpdateBuffer();
            log.info("Main patient item query...")
            for iPatient, patientItemList in enumerate(self.queryPatientItemsPerPatient(analysisOptions, progress=progress, conn=conn)):
                log.debug("Calculate associations for Patient %d's %d patient items. %d associations in buffer." % (iPatient, len(patientItemList), updateBuffer["nAssociations"]) );
                self.updateItemAssociationsBuffer(patientItemList, updateBuffer, analysisOptions, linkedItemIdsByBaseId, progress=progress);
                if self.readyForIntervalCommit(iPatient, updateBuffer, analysisOptions):
                    log.info("Commit after %s patients" % (iPatient+1) );
                    self.persistUpdateBuffer(updateBuffer, linkedItemIdsByBaseId, analysisOptions, iPatient, conn=conn);  # Periodically commit update buffer
                else:   # If not committing, still send a quick arbitrary query to DB,
                        # otherwise connection may get recycled because DB thinks timeout with no interaction
                    DBUtil.execute("select 1+1", conn=conn);
            log.info("Final commit / persist");
            self.persistUpdateBuffer(updateBuffer, linkedItemIdsByBaseId, analysisOptions, -1, conn=conn);  # Final update buffer commit. Don't use iPatient here, as may collide if interval commit happened to land on last patient
        finally:
            conn.close();
Example No. 8
def addLabFeatures(labsByBaseNameByPatientId, patientById, colNames, indexItemBaseName, labBaseNames, labPreTime, labPostTime):
    log.info("Sort lab results by result time for each patient and find items within specified time period to aggregate");
    prog = ProgressDots();
    for iPatient, (patientId, labsByBaseName) in enumerate(labsByBaseNameByPatientId.iteritems()):
        # Look for the first result of the index item (ferritin)
        indexItem = None;
        if indexItemBaseName in labsByBaseName:
            for labResult in labsByBaseName[indexItemBaseName]:
                if indexItem is None or labResult["result_time"] < indexItem["result_time"]:
                    indexItem = labResult;

        if indexItem is not None:   # Skip this patient if no index item found, should not be possible since pre-screened for relevant patients
            indexTime = indexItem["result_time"];

            patient = patientById[patientId];
            patient["index_time"] = indexTime;

            preTimeLimit = indexTime+labPreTime;
            postTimeLimit = indexTime+labPostTime;

            # Init values for each lab of interest to an empty list
            for labBaseName in labBaseNames:
                # Default to null for all values
                patient["%s.min" % labBaseName] = None;
                patient["%s.max" % labBaseName] = None;
                patient["%s.median" % labBaseName] = None;
                patient["%s.mean" % labBaseName] = None;
                patient["%s.std" % labBaseName] = None;
                patient["%s.first" % labBaseName] = None;
                patient["%s.last" % labBaseName] = None;
                patient["%s.proximate" % labBaseName] = None;
                
                proximateValue = None;
                if labBaseName in labsByBaseName:   # Not all patients will have all labs checked
                    proximateItem = None;   # Item closest to the index item in time
                    valueList = list();
                    for labResult in labsByBaseName[labBaseName]:
                        resultTime = labResult["result_time"];
                        if preTimeLimit <= resultTime and resultTime < postTimeLimit:
                            # Occurs within time frame of interest, so record this value
                            valueList.append(labResult["ord_num_value"]);
                            
                        if proximateItem is None or (abs(resultTime-indexTime) < abs(proximateItem["result_time"]-indexTime)):
                            proximateItem = labResult;
                    proximateValue = proximateItem["ord_num_value"];
                
                    if len(valueList) > 0:
                        patient["%s.min" % labBaseName] = np.min(valueList);
                        patient["%s.max" % labBaseName] = np.max(valueList);
                        patient["%s.median" % labBaseName] = np.median(valueList);
                        patient["%s.mean" % labBaseName] = np.mean(valueList);
                        patient["%s.std" % labBaseName] = np.std(valueList);
                        patient["%s.first" % labBaseName] = valueList[0];   # Assumes previously sorted 
                        patient["%s.last" % labBaseName] = valueList[-1];   #   by result_time
                    patient["%s.proximate" % labBaseName] = proximateValue;

                prog.update();
    colNames.extend(colsFromLabBaseNames(labBaseNames));
    prog.printStatus();
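
colsFromLabBaseNames is not shown; based on the keys written above, it presumably expands each lab base name into the eight aggregate feature column names. A sketch under that assumption:

def colsFromLabBaseNames(labBaseNames):
    # Expand each lab base name into the aggregate feature columns populated above
    suffixes = ("min", "max", "median", "mean", "std", "first", "last", "proximate")
    return ["%s.%s" % (baseName, suffix) for baseName in labBaseNames for suffix in suffixes]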
Example No. 9
    def generatePatientItemsForCompositeId(self,
                                           clinicalItemIds,
                                           compositeId,
                                           conn=None):
        """Create patient_item records for the composite to match the given clinical item ID patient items.
        """
        extConn = True
        if conn is None:
            conn = self.connFactory.connection()
            extConn = False
        try:
            # Record linking information
            for componentId in clinicalItemIds:
                linkModel = RowItemModel()
                linkModel["clinical_item_id"] = compositeId
                linkModel["linked_item_id"] = componentId

                insertQuery = DBUtil.buildInsertQuery("clinical_item_link",
                                                      linkModel.keys())
                insertParams = linkModel.values()
                DBUtil.execute(insertQuery, insertParams, conn=conn)

            # Extract back link information, which will also flatten out any potential inherited links
            linkedItemIdsByBaseId = self.loadLinkedItemIdsByBaseId(conn=conn)
            linkedItemIds = linkedItemIdsByBaseId[compositeId]

            # Create patient_item records for the composite clinical item to overlap existing component ones
            # First query for the existing component records
            query = SQLQuery()
            query.addSelect("*")
            query.addFrom("patient_item")
            query.addWhereIn("clinical_item_id", linkedItemIds)
            results = DBUtil.execute(query, includeColumnNames=True, conn=conn)
            patientItems = modelListFromTable(results)

            # Patch component records to instead become composite item records then insert back into database
            progress = ProgressDots(total=len(patientItems))
            for patientItem in patientItems:
                del patientItem["patient_item_id"]
                patientItem["clinical_item_id"] = compositeId
                patientItem["analyze_date"] = None

                insertQuery = DBUtil.buildInsertQuery("patient_item",
                                                      patientItem.keys())
                insertParams = patientItem.values()

                try:
                    # Optimistic insert of a new unique item
                    DBUtil.execute(insertQuery, insertParams, conn=conn)
                except conn.IntegrityError, err:
                    # If turns out to be a duplicate, okay, just note it and continue to insert whatever else is possible
                    log.info(err)
                progress.Update()

            # progress.PrintStatus();
        finally:
            if not extConn:
                conn.close()
Example No. 10
    def buildModel(self, corpusBOWGenerator, numTopics):
        """Build topic model from corpus (interpret as generator over contents)

        Given the bag-of-words corpus, build a docCountByWordId count dictionary
        to facilitate subsequent Term Frequency * Inverse Document Frequency (TF*IDF) calculations.
        In the clinical context, document = patient.

        Return (model, docCountByWordId);
        """
        # Load dictionary to translate item IDs to descriptions
        itemsById = DBUtil.loadTableAsDict("clinical_item")
        id2word = dict()
        # Models expect a pair for every possible item ID, and judges vocabulary size by length of this dictionary rather than the maximum ID values.  That means have to populate all of the empty ones as well.
        maxId = max(itemsById.keys())
        for itemId in range(maxId + 1):
            description = str(itemId)
            # Default to just the same as the ID string
            if itemId in itemsById:
                description = itemsById[itemId]["description"]
            id2word[itemId] = description

        # Stream in progressive updates from corpus generator so don't have to load all into memory
        # Do a batch of many at a time, otherwise very slow to increment one at a time
        docBuffer = list()

        prog = ProgressDots()
        self.model = None
        self.docCountByWordId = {
            None: 0
        }
        # Use None key to represent count of all documents
        for i, document in enumerate(corpusBOWGenerator):
            for (
                    wordId, wordCount
            ) in document:  # Assuming uniqueness of wordId keys for each document
                if wordId not in self.docCountByWordId:
                    self.docCountByWordId[wordId] = 0
                self.docCountByWordId[wordId] += 1
            self.docCountByWordId[None] += 1

            docBuffer.append(document)
            if i % BUFFER_UPDATE_SIZE == (
                    BUFFER_UPDATE_SIZE -
                    1):  # Update model with current buffer of documents
                self.model = self.updateModel(self.model, docBuffer, id2word,
                                              numTopics)
                docBuffer = list()
                # Discard committed buffer
            prog.update()

        self.model = self.updateModel(self.model, docBuffer, id2word,
                                      numTopics)
        # Last update for any remaining documents in buffer
        docBuffer = list()
        # Discard committed buffer

        # prog.printStatus();
        return (self.model, self.docCountByWordId)
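
updateModel is not shown in this example. Given the bag-of-words batching above, a plausible (assumed) implementation with gensim would create the LDA model on the first batch and fold in later batches incrementally:

from gensim.models.ldamodel import LdaModel

def updateModel(model, docBuffer, id2word, numTopics):
    """Hypothetical sketch (a method taking self in the class above): build the
    LDA model on the first document batch, then fold in later batches."""
    if not docBuffer:
        return model            # Nothing new to fold in
    if model is None:
        model = LdaModel(corpus=docBuffer, id2word=id2word, num_topics=numTopics)
    else:
        model.update(docBuffer)
    return model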
Example No. 11
    def main(self, argv):
        """Main method, callable from command line"""
        usageStr = "usage: %prog -p <pid> -c <commandStr>\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-p",
            "--pid",
            dest="pid",
            help=
            "Process ID to monitor.  As soon as it is no longer found by Unix signal (os.kill(pid,0)), will proceed to execute the given command"
        )
        parser.add_option(
            "-i",
            "--interval",
            dest="interval",
            default="1",
            help=
            "How many seconds to wait before checking if the PID is still active.  Default to 1 second."
        )
        parser.add_option(
            "-c",
            "--commandStr",
            dest="commandStr",
            help=
            "Command string to execute after the specified PID is longer found.  Will just pass whole string to subprocess"
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()

        ###########---From medinfo/common/support/awaitProcess.py)---#############
        if options.pid and options.commandStr:
            pid = int(options.pid)
            interval = float(options.interval)

            prog = ProgressDots(60, 1, "intervals")
            while pidExists(pid):
                time.sleep(interval)
                prog.update()
            # prog.printStatus();

            print >> sys.stderr, "Executing: ", options.commandStr
            process = subprocess.Popen(options.commandStr)
            print >> sys.stderr, "Started process: ", process.pid
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        print >> sys.stderr, ("%.3f seconds to complete" % timer)
        ###---END awaitProcess method---###

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
Example No. 12
    def analyzePatientItems(self, patientIds, itemIdSequence, virtualItemId):
        """Primary run function to analyze patient clinical item data and
        record updated stats to the respective database tables.

        Does the analysis only for records pertaining to the given patient IDs
        (provides a way to limit the extent of analysis depending on params).

        Note that this does NOT record analyze_date timestamp on any records analyzed,
        as would collide with AssociationAnalysis primary timestamping, thus it is the
        caller's responsibility to be careful not to repeat this analysis redundantly
        and generating duplicated statistics.
        """
        progress = ProgressDots()
        conn = self.connFactory.connection()
        try:
            # Preload lookup data to facilitate rapid checks and filters later
            linkedItemIdsByBaseId = self.dataManager.loadLinkedItemIdsByBaseId(
                conn=conn)
            self.verifyVirtualItemLinked(itemIdSequence,
                                         virtualItemId,
                                         linkedItemIdsByBaseId,
                                         conn=conn)

            # Keep an in memory buffer of the updates to be done so can stall and submit them
            #   to the database in batch to minimize inefficient DB hits
            updateBuffer = dict()
            log.info("Main patient item query...")
            analysisOptions = AnalysisOptions()
            analysisOptions.patientIds = patientIds
            for iPatient, patientItemList in enumerate(
                    self.queryPatientItemsPerPatient(analysisOptions,
                                                     progress=progress,
                                                     conn=conn)):
                log.debug(
                    "Calculate associations for Patient %d's %d patient items"
                    % (iPatient, len(patientItemList)))
                self.updateItemAssociationsBuffer(itemIdSequence,
                                                  virtualItemId,
                                                  patientItemList,
                                                  updateBuffer,
                                                  linkedItemIdsByBaseId,
                                                  progress=progress)
                # Periodically send a quick arbitrary query to DB, otherwise connection may get recycled because DB thinks timeout with no interaction
                DBUtil.execute("select 1+1", conn=conn)
            log.info("Final commit")
            self.commitUpdateBuffer(updateBuffer,
                                    linkedItemIdsByBaseId,
                                    conn=conn)
            # Final update buffer commit
        finally:
            conn.close()
Example No. 13
    def __call__(self, analysisQuery, conn=None):
        extConn = True;
        if conn is None:
            conn = self.connFactory.connection();
            extConn = False;

        try:
            # Preload some lookup data to facilitate subsequent checks
            categoryIdByItemId = dict();
            lookupTable = DBUtil.execute("select clinical_item_id, clinical_item_category_id from clinical_item", conn=conn);
            for (clinicalItemId, categoryId) in lookupTable:
                categoryIdByItemId[clinicalItemId] = categoryId;

            # Recommender to test with
            recommender = analysisQuery.recommender;

            # Start building basic recommendation query to use for testing
            recQuery = analysisQuery.baseRecQuery;

            # Start building results data
            resultsTable = list();
            progress = ProgressDots(50,1,"Item Recommendations");

            # Query for all of the order / item data for the test patients.  Load one patient's data at a time
            for (patientId, clinicalItemIdList) in self.queryPatientClinicalItemData(analysisQuery, conn=conn):
                # Parse through the patient's item list and run serial recommendation queries
                #   to find each item's accumulated recommendation rank
                serialRecDataGen = \
                    self.reviewSerialRecommendations \
                    (   patientId,
                        clinicalItemIdList,
                        analysisQuery,
                        recQuery,
                        recommender,
                        categoryIdByItemId,
                        progress=progress,
                        conn=conn
                    );

                for (clinicalItemId, iItem, iRecItem, recRank, recScore) in serialRecDataGen:
                    resultsRow = (patientId, clinicalItemId, iItem, iRecItem, recRank, recScore);
                    resultsTable.append(resultsRow);

            # progress.PrintStatus();

            #print resultsTable
            return resultsTable;

        finally:
            if not extConn:
                conn.close();
Example No. 14
    def convertSourceItems(self, patientIds=None):
        """Primary run function to process the contents of the starr_datalake2018.demographic
        table and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies to avoid repeating conversion.

        patientIds - If provided, only process items for patient IDs matching those provided
        """
        log.info("Conversion for patients starting with: %s, %s total" % (patientIds[:5], len(patientIds)))
        progress = ProgressDots()

        with self.connFactory.connection() as conn:
            category_model = self.categoryFromSourceItem(conn)   # only 1 category - no need to have it in the loop
            for sourceItem in self.querySourceItems(patientIds, progress):
                self.convertSourceItem(category_model, sourceItem, conn)
Example No. 15
    def convertSourceItems(self, patientIds=None):
        """Primary run function to process the contents of the stride_patient
        table and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies to avoid repeating conversion.

        patientIds - If provided, only process items for patient IDs matching those provided
        """
        log.info("Conversion for patients starting with: %s, %s total" %
                 (patientIds[:5], len(patientIds)))
        progress = ProgressDots()

        with self.connFactory.connection() as conn:
            for sourceItem in self.querySourceItems(patientIds,
                                                    progress=progress):
                self.convertSourceItem(sourceItem, conn=conn)
Example No. 16
    def convertSourceItems(self, userSIDs=None, limit=None, offset=None):
        """Primary run function to process the contents of the source table
        and convert them into normalized data table entries.
        """
        log.info("Conversion for patients: %s" % userSIDs);
        progress = ProgressDots();
        conn = self.connFactory.connection();
        try:
            for i, sourceItem in enumerate(self.querySourceItems(userSIDs, limit, offset, progress=progress, conn=conn)):
                self.convertSourceItem(sourceItem, conn=conn);
                progress.Update();

            # Go through accumulated metric description lines into single entries for the metric table
            self.updateMetricDescriptionLines();
        finally:
            conn.close();
Example No. 17
    def convertSourceItems(self, patientIds=None):
        """Primary run function to process the contents of the stride_patient
        table and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies to avoid repeating conversion.

        patientIds - If provided, only process items for patient IDs matching those provided
        """
        log.info("Conversion for patients: %s" % patientIds)
        progress = ProgressDots()
        conn = self.connFactory.connection()
        try:
            for sourceItem in self.querySourceItems(patientIds,
                                                    progress=progress,
                                                    conn=conn):
                self.convertSourceItem(sourceItem, conn=conn)
        finally:
            conn.close()
Example No. 18
    def convertSourceItems(self, startDate=None, endDate=None):
        """Primary run function to process the contents of the source table
        and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies to avoid repeating conversion.

        startDate - If provided, only return items whose noted_date is on or after that date.
        endDate - If provided, only return items whose noted_date is before that date.
        """
        log.info("Conversion for items dated %s to %s" % (startDate, endDate));
        progress = ProgressDots();
        conn = self.connFactory.connection();
        try:
            for sourceItem in self.querySourceItems(startDate, endDate, progress=progress, conn=conn):
                self.convertSourceItem(sourceItem, conn=conn);
        finally:
            conn.close();
        progress.PrintStatus();
Example No. 19
def main(argv):
    conversionProcessor = STRIDEOrderResultsConversion();
    
    conn = DBUtil.connection();
    try:
        # Pull out list of result names to look for that are not already in the calculated
        nameTable = DBUtil.execute("select name from sim_result    except    select base_name from order_result_stat", conn=conn);
        prog = ProgressDots(big=1,small=1,total=len(nameTable));
        for row in nameTable:
            baseName = row[0];
            print("Calculating Stats for %s" % baseName, file=sys.stderr);
            statModel = conversionProcessor.calculateResultStats( baseName, conn=conn );
            DBUtil.insertRow("order_result_stat", statModel, conn=conn );
            prog.update();
        prog.printStatus();
        conn.commit();
    finally:
        conn.close();
Example No. 20
def parseClinicalItemFile(itemFile, patientIdCol="patient_id", timeCol="item_date"):
    prog = ProgressDots();
    itemTimesByPatientId = dict();
    for itemData in TabDictReader(itemFile):
        patientId = int(itemData[patientIdCol]);
        itemTime = DBUtil.parseDateValue(itemData[timeCol]);

        itemData[patientIdCol] = patientId;
        itemData[timeCol] = itemTime;

        if patientId not in itemTimesByPatientId:
            itemTimesByPatientId[patientId] = list();
        itemTimesByPatientId[patientId].append( itemTime );

        prog.update();
    prog.printStatus();

    return itemTimesByPatientId;
Example No. 21
    def __call__(self, inputFiles):
        """Return generator over dictionary objects representing
        the concatenated contents of the input files after adding and accounting for argv parameter columns.
        """
        # Consolidate a master set of all column headers to use
        self.colNames = list()
        # Keep track of consistent order found
        colSet = set()
        # Keep track of unique items

        # Pull out any header comments that may represent an argv list to parse
        # Pull out header row with column labels for each input file
        argvDicts = list()
        # Dictionary for each input file keyed by argv parameter name with respective value
        readers = list()
        # TabDictReader from which header columns can be accessed as fieldnames
        for inputFile in inputFiles:
            reader = TabDictReader(inputFile)
            readers.append(reader)
            for col in reader.fieldnames:
                if col not in colSet:
                    colSet.add(col)
                    self.colNames.append(col)

            argvDict = self.extract_argvDict(reader.commentLines)
            # Must be called after reader.fieldnames so initial text parsing will start
            argvDicts.append(argvDict)
            for col in argvDict.iterkeys():
                if col not in colSet:
                    colSet.add(col)
                    self.colNames.append(col)

        prog = ProgressDots(50, 1, "Files")

        # Now generate each file in succession, but "outer-joined" to include the master column header list
        for argvDict, reader in zip(argvDicts, readers):
            for resultDict in reader:
                resultDict.update(argvDict)
                for col in self.colNames:
                    if col not in resultDict:
                        resultDict[col] = None
                yield resultDict
            prog.update()
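
The __call__ above presumably lives on a small file-concatenator class; with a hypothetical class name and file names, usage might look like:

concatenator = ResultFileConcatenator()     # Hypothetical class hosting the __call__ above
with open("resultsA.tab") as fileA, open("resultsB.tab") as fileB:
    for rowDict in concatenator([fileA, fileB]):
        # Every row carries the union of columns; missing ones were filled with None
        print([rowDict[col] for col in concatenator.colNames])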
Example No. 22
    def convertSourceItems(self, convOptions):
        """Primary run function to process the contents of the stride_order_med
        table and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies after the fact to catch repeated conversions.

        startDate - If provided, only return items whose ordering_date is on or after that date.
        endDate - If provided, only return items whose ordering_date is before that date.
        """
        log.info("Conversion for items dated %s to %s" %
                 (convOptions.startDate, convOptions.endDate))
        progress = ProgressDots()
        conn = self.connFactory.connection()
        try:
            # Load up the medication mapping table to facilitate subsequent conversions
            rxcuiDataByMedId = self.loadRXCUIData(conn=conn)

            # Keep track of which order meds have already been converted based on mixture components (don't repeat for the aggregate order then)
            # Can be a lot to store in local memory for large conversions, so may need to batch smaller sub-processes
            convertedOrderMedIds = set()

            # First round for medication combinations that must be extracted from order_medmixinfo table
            for sourceItem in self.queryMixSourceItems(rxcuiDataByMedId,
                                                       convOptions,
                                                       progress=progress,
                                                       conn=conn):
                self.convertSourceItem(sourceItem, conn=conn)
                convertedOrderMedIds.add(sourceItem["order_med_id"])
                progress.Update()

            # Next round for medications directly from order_med table not addressed in medmix
            for sourceItem in self.querySourceItems(rxcuiDataByMedId,
                                                    convOptions,
                                                    progress=progress,
                                                    conn=conn):
                if sourceItem[
                        "order_med_id"] not in convertedOrderMedIds:  # Don't repeat conversion if mixture components already addressed
                    self.convertSourceItem(sourceItem, conn=conn)
                progress.Update()

        finally:
            conn.close()
        progress.PrintStatus()
Example No. 23
    def convertSourceItems(self, convOptions):
        """Primary run function to process the contents of the raw source
        table and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
        Should look for redundancies after the fact to catch repeated conversions.

        startDate - If provided, only return items whose ordering_date is on or after that date.
        endDate - If provided, only return items whose ordering_date is before that date.
        """
        log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate));
        progress = ProgressDots();
        conn = self.connFactory.connection();
        try:
            # Next round for medications directly from order_med table not addressed in medmix
            for sourceItem in self.querySourceItems(convOptions, progress=progress, conn=conn):
                self.convertSourceItem(sourceItem, conn=conn);
                progress.Update();

        finally:
            conn.close();
        progress.PrintStatus();
Example No. 24
def main_quickTest(argv):
    modelFilename = argv[1]
    modeler = TopicModel()

    timer = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    timer = time.time() - timer
    log.info("%.2f seconds to load", timer)

    timer = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(
        model, 100)
    timer = time.time() - timer
    log.info("%.2f seconds to generate weights", timer)

    for i in xrange(3):
        prog = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                prog.update()
        prog.printStatus()
    """
Example No. 25
def execute( query, parameters=None, includeColumnNames=False, incTypeCodes=False, formatter=None, 
            conn=None, connFactory=None, autoCommit=True):
    """Execute a single SQL query / command against the database.
    If the description attribute is not None, this implies
    this was a select statement that produced a result set which 
    will be returned by the fetchall() method.
    
    If the description is null, then at least return the rowcount
    affected by the query.  This may be -1 or None still
    if it is a non-row affecting command (e.g. create / drop).
    
    If includeColumnNames is true and the query yields a result set,
    then one row (list) will be added to the beginning which contains
    the names of each column as extracted from the cursor.description.
    
    If incTypeCodes is true and the query yields a result set, a row (list)
    will be added to the beginning (but after column names if those are
    included as well), which contains the numerical type codes of
    each column as extracted from the cursor.description.
    
    This method is probably not terribly efficient and should 
    only be used for prototype testing and short command line functions.
    For retrieving data to send to stdout or some other stream,
    add the formatter parameter as an instance of a ResultFormatter
    object to pipe the data through one fetch at a time.  In that case,
    the full results (which are presumably large) will NOT be returned
    by the method.
    
    If the query object is actually a SQLQuery object, then will
    use the SQLQuery.getParams() as the params, and str(SQLQuery)
    as the query string.
    
    If autoCommit is True, will autoCommit.  The function will also autoCommit if an external connection 
    is NOT supplied.  
    """
    # Look for an explicitly specified external connection
    extConn = conn is not None
    if conn is None:
        # If no specific connection object provided, look for a connection factory
        #   to produce one
        if connFactory is not None:
            conn = connFactory.connection()
        else:
            # No connection or factory specified, just fall back on default connection then
            conn = connection()

    cur = conn.cursor()
    
    if isinstance(query, SQLQuery):
        if parameters is None:
            parameters = tuple(query.getParams())
        else:
            parameters = tuple(parameters)
        query = str(query)
    elif parameters is None:
        parameters = ()

    #log.debug(parameterizeQueryString(query,parameters));

    returnValue = None    
    try:
        timer = time.time();
        try:
            cur.execute( query, parameters )
        except Exception, err:
            log.error(err);
            #log.error(parameterizeQueryString(query,parameters));
            if (not extConn) or autoCommit: 
                conn.rollback();
            raise;
        timer = time.time() - timer;
        log.debug("Query Time: (%1.3f sec)" % timer );

        if cur.description != None:
            returnValue = []

            colNames = None;
            if includeColumnNames:
                colNames = columnNamesFromCursor(cur);
                returnValue.append(colNames)

            if incTypeCodes:
                typeCodes = typeCodesFromCursor(cur);
                returnValue.append(typeCodes);
            
            
            if formatter != None:
                # An output formatter was specified, pipe the data out one row at time
                if includeColumnNames:
                    formatter.formatTuple(colNames)
                
                progress = ProgressDots();
                row = cur.fetchone()
                while row != None:
                    formatter.formatTuple(row)
                    row = cur.fetchone()
                    progress.Update();
                log.info("%d Rows Completed",progress.GetCounts());                

                returnValue = cur.rowcount
            else:
                # No formatter specified, just return the entire result set
                dataTable = list(cur.fetchall());
                for i, row in enumerate(dataTable):
                    dataTable[i] = list(row);
                returnValue.extend(dataTable);
        else:
            returnValue = cur.rowcount
        if (not extConn) or autoCommit: 
            conn.commit()
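
A hypothetical call pattern for the function above, following its docstring and the SQLQuery usage in the earlier examples (table and filter are for illustration only):

query = SQLQuery()
query.addSelect("clinical_item_id, name")
query.addFrom("clinical_item")
query.addWhereOp("name", "like", "LAB%%")

results = execute(query, includeColumnNames=True)
headers = results[0]        # First row holds column names when includeColumnNames is True
for row in results[1:]:
    print(dict(zip(headers, row)))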
Example No. 26
    'LABNTBNP', 'LABOSM', 'LABPALB', 'LABPCCG4O', 'LABPCCR', 'LABPCTNI',
    'LABPHOS', 'LABPLTS', 'LABPROCT', 'LABPT', 'LABPTEG', 'LABPTT', 'LABRESP',
    'LABRESPG', 'LABRETIC', 'LABSPLAC', 'LABSTLCX', 'LABSTOBGD', 'LABTNI',
    'LABTRFS', 'LABTRIG', 'LABTSH', 'LABUCR', 'LABUOSM', 'LABUA', 'LABUAPRN',
    'LABUPREG', 'LABURIC', 'LABURNA', 'LABURNC', 'LABUSPG'
]

labs_to_test = ["LABLACWB"]
#    [#"LABMGN, #"LABPTT", #"LABPHOS", "LABTNI", #"LABLACWB",
#    #"LABA1C", #"LABHEPAR", #"LABPLTS", "LABLAC","LABLIPS",
#    "LABTSH", "LABHCTX", "LABLDH", "LABK", "LABNTBNP",
#    "LABCRP", "LABCK", #"LABFER",
#    "LABUSPG"]

baseArgv = \
    ["python","LabChangePredictionPipeline.py"]

prog = ProgressDots(1, 1, "Processes", total=len(labs_to_test))
for lab_name in labs_to_test:
    key = "%.3d.%s" % (prog.getCounts(), lab_name)
    key = key.replace(
        "/",
        "..")  # Don't want to use directory separator in temp log file name
    argv = list(baseArgv)
    argv.append(lab_name)
    log.info("Starting: " + str.join(" ", argv))
    logFile = stdOpen(LOG_FILE_TEMPLATE % key, "w")

    p = subprocess.Popen(argv, stderr=logFile)
    # subprocess.Popen returns immediately; call p.wait() here to run one lab at a time, or leave as-is to launch instances in parallel
Example No. 27
def insertFile( sourceFile, tableName, columnNames=None, delim=None, idFile=None, skipErrors=False, dateColFormats=None, escapeStrings=False, estInput=None, connFactory=None ):
    """Insert the contents of a whitespace-delimited text file into the database.
    
    For PostgreSQL specifically, consider the alternative direct COPY command, which can run ~10x faster:
    E.g., gzip -d -c TTeam_2014.tsv.gz | psql -U jonc101 -c "COPY tteamx (  pat_deid,  enc_deid,  relation,  prov_id,  prov_name,  start_date,  end_date ) FROM STDIN WITH (FORMAT csv, DELIMITER E'\t', HEADER, NULL 'None');" resident-access-log-2017

    Inserts the contents of the <sourceFile> into the database
    under the <tableName>.  One line is expected in the <sourceFile>
    per row in the database, with each item delimited by the <delim>
    character.  These items will be inserted under the respective
    order of the given list of columnNames.

    Use the built-in csv module for parsing out lines and managing quotes, etc.
    If delimiter is not specified (None), then default to tab-delimited
    
    If idFile is provided, then will try to run SQL from identityQuery method
    after each insert, and write out the contents, one per line to the idFile.
    Will bypass the above step if an insert column with the expected default ID column name ("tableName_id") is found
    
    If dateColFormats provided, expect a dictionary keyed by the names of columns
    that should be interpreted as date strings, with values equal to the 
    Python date format string to parse them by.  
    If a format string is not provided, a series of standard date format strings will be attempted 
    (but this is inefficient for repeated date text parsing and error handling).
    
    Returns the total number of rows successfully inserted.
    """
    if columnNames is not None and len(columnNames) < 1:
        columnNames = None; # If empty columnNames list, then reset to null and look for it in first line of data

    reader = TabDictReader(sourceFile, fieldnames=columnNames, delimiter=delim);
    columnNames = reader.fieldnames;

    idCol = defaultIDColumn(tableName);
    iIdCol = None;  # Index of manually specified ID column. May be null
    for iCol, colName in enumerate(columnNames):
        if colName == idCol:
            iIdCol = iCol;

    if dateColFormats is not None:
        # Ensure column keys are normalized
        dateCols = dateColFormats.keys();
        for dateCol in dateCols:
            normalCol = normalizeColName(dateCol);
            dateColFormats[normalCol] = dateColFormats[dateCol];

    conn = None;
    if connFactory is not None:
        conn = connFactory.connection();
    else:
        conn = connection()
    cur  = conn.cursor()
    
    try:
        # Prepare the SQL Statement
        sqlParts = []
        sqlParts.append("insert into")
        sqlParts.append( tableName )

        sqlParts.append("(")
        sqlParts.append( str.join(",", columnNames) );
        sqlParts.append(")")

        sqlParts.append("values")
        sqlParts.append("(")
        for i in range(len(columnNames)):
            sqlParts.append( Env.SQL_PLACEHOLDER )    # Parameter placeholder, depends on DB-API
            sqlParts.append(",")
        sqlParts.pop(); # Remove extra end comma
        sqlParts.append(")")

        sql = str.join(" ",sqlParts)

        log.debug(sql)
        

        # Loop through the file and execute the insert statement every time enough delimited parameters are found.
        nInserts = 0
        nCols = len(columnNames)
        params = list();
        progress = ProgressDots(total=estInput);
        for iLine, rowModel in enumerate(reader):
            # Parse out data values from strings
            for iCol, colName in enumerate(columnNames):
                value = parseValue(rowModel[colName], colName, dateColFormats, escapeStrings);
                params.append(value);

            log.debug(params)
            try:
                cur.execute(sql,tuple(params))
                nInserts += cur.rowcount

                if idFile != None:
                    rowId = None;
                    if iIdCol is not None:  # Look for manually assigned ID value first
                        rowId = params[iIdCol];
                    else:
                        cur.execute(identityQuery(tableName));
                        rowId = cur.fetchone()[0];
                    print >> idFile, rowId;

                # Need to "auto-commit" after each command, 
                #   otherwise a skipped error will rollback 
                #   any previous commands as well
                if skipErrors: 
                    conn.commit()    

                progress.Update()

            except Exception, err:
                log.info(sql);
                log.info(tuple(params))
                conn.rollback();    # Reset any changes since the last commit
                if skipErrors:
                    log.warning("Error Executing in Script: "+ sql )
                    log.warning(err)
                else:
                    raise;
            params = list();

        conn.commit()

        return nInserts
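
A hypothetical call for the function above; the file name and date format are made up, while the table and column names follow the other examples:

with open("patient_item.tsv") as sourceFile:
    nInserted = insertFile(sourceFile, "patient_item",
                           delim="\t",
                           dateColFormats={"item_date": "%Y-%m-%d %H:%M:%S"},
                           skipErrors=True)
    print("%d rows inserted" % nInserted)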
Example No. 28
def updateFromFile( sourceFile, tableName, columnNames=None, nIdCols=1, delim=None, skipErrors=False, connFactory=None  ):
    """Update the database with the contents of a whitespace-delimited text file.
    
    Updates the contents of the <tableName> with the data from the <sourceFile>.  
    One line is expected in the <sourceFile> per row in the database, with each item 
    delimited by the <delim> character (specify None for any whitespace).  
    These items will be inserted under the respective order of the given list of 
    <columnNames>.  If the columnNames parameter is not provided, assume the
    first line of the <sourceFile> contains the column names.

    To know which rows to update, assume the FIRST column listed in <columnNames> is
    the ID column to identify rows by.  In that case, the data value there from the
    <sourceFile> will not be used to update the row, but will instead be used to
    identify the row to update the rest of the data by.  If more than one column
    is necessary to identify a row (composite key), indicate how many of the
    first columns in <columnNames> should be used with <nIdCols>.  Note that these key ID 
    values must not be None / null.  The query looks for rows where columnname = value,
    and the = operator always returns false when the value is null.

    Returns the total number of rows successfully updated.
    """
    if columnNames is None or len(columnNames) < 1:
        headerLine = sourceFile.readline();
        columnNames = headerLine.split(delim);
    
    conn = None;
    if connFactory is not None:
        conn = connFactory.connection();
    else:
        conn = connection()
    cur  = conn.cursor()

    nCols = len(columnNames);
    
    try:
        # Prepare the SQL Statement
        sql = [];
        sql.append("update");
        sql.append( tableName );
        sql.append("set");

        # Data Columns
        for i in xrange(nIdCols,nCols):
            sql.append(columnNames[i]);
            sql.append("=");
            sql.append(Env.SQL_PLACEHOLDER);
            sql.append(",");
        sql.pop();  # Remove extra comma at end

        # ID Columns
        sql.append("where")
        for i in xrange(nIdCols):
            sql.append(columnNames[i]);
            sql.append("=");
            sql.append(Env.SQL_PLACEHOLDER);
            sql.append("and");
        sql.pop();  # Remove extra comma at end

        sql = str.join(" ",sql);

        log.debug(sql)

        # Loop through file and execute update statement for every line
        progress = ProgressDots()
        for iLine, line in enumerate(sourceFile):
            if not line.startswith(COMMENT_TAG):
                try:
                    line = line[:-1];    # Strip the newline character
                    params = line.split(delim);
                    
                    # Special handling for null / None string
                    for iParam in xrange(len(params)):
                        if params[iParam] == "" or params[iParam] == NULL_STRING:   # Treat blank strings as NULL
                            params[iParam] = None;

                    # Reposition ID columns to end of parameter list
                    idParams = params[:nIdCols];
                    dataParams = params[nIdCols:];
                    paramTuple = dataParams;
                    paramTuple.extend( idParams );
                    paramTuple = tuple(paramTuple);
                    
                    cur.execute(sql, paramTuple);

                    # Need to "auto-commit" after each command, 
                    #   otherwise a skipped error will rollback 
                    #   any previous commands as well
                    if skipErrors:
                        conn.commit()    

                    progress.Update()

                except Exception, err:
                    conn.rollback();    # Reset changes and connection state
                    log.critical(sql);
                    log.critical(paramTuple);
                    log.warning("Error Executing in Script: %s", parameterizeQueryString(sql,paramTuple) );
                    if skipErrors:
                        log.warning(err)
                    else:
                        raise err

        conn.commit()

        return progress.GetCounts();
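
A hypothetical call for the function above, updating name and description on clinical_item rows identified by the first (ID) column:

with open("clinical_item_updates.tab") as sourceFile:
    nUpdated = updateFromFile(sourceFile, "clinical_item",
                              columnNames=["clinical_item_id", "name", "description"],
                              nIdCols=1, delim="\t")
    print("%d rows updated" % nUpdated)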
Example No. 29
    def __call__(self,
                 inputFile,
                 labelCols,
                 valueCols,
                 matchCols,
                 baseLabels=None):
        prog = ProgressDots()

        self.labelCols = labelCols
        self.valueCols = valueCols
        self.matchCols = matchCols
        self.baseLabels = baseLabels

        labelModelByLabelKey = dict()
        dataByLabelKey = dict()

        reader = TabDictReader(inputFile)
        for rowModel in reader:
            labelKey = list()
            labelModel = dict()
            for labelCol in self.labelCols:
                labelModel[labelCol] = rowModel[labelCol]
                labelKey.append(rowModel[labelCol])
            labelKey = tuple(labelKey)
            # Change to immutable object that can be hashed

            # Copy just items of interest
            valueModel = {}
            if self.matchCols:
                for matchCol in self.matchCols:
                    valueModel[matchCol] = rowModel[matchCol]
            for valueCol in self.valueCols:
                try:
                    valueModel[valueCol] = float(rowModel[valueCol])
                except ValueError:  # Maybe None string, could not parse into a number
                    valueModel[valueCol] = None

            if labelKey not in dataByLabelKey:
                labelModelByLabelKey[labelKey] = labelModel
                dataByLabelKey[labelKey] = list()
            dataByLabelKey[labelKey].append(valueModel)

            prog.update()

        # prog.printStatus();

        # Another pass to ensure data is consistently sorted in each group to allow later paired t-tests
        if self.matchCols:
            for labelKey, data in dataByLabelKey.iteritems():
                data.sort(RowItemFieldComparator(self.matchCols))

        # See if looking for only one set of base labeled data to compare the rest against
        baseLabelKey = None
        if self.baseLabels is not None:
            baseLabelKey = tuple(self.baseLabels)

        # Result pass to compare all group pair-wise combinations
        prog = ProgressDots()
        for labelKey0, data0 in dataByLabelKey.iteritems():
            prefix0 = "Group0."
            labelModel0 = labelModelByLabelKey[labelKey0]

            if baseLabelKey is not None and labelKey0 != baseLabelKey:
                continue
                # Skip entries where the base label does not match specified key

            for labelKey1, data1 in dataByLabelKey.iteritems():
                prefix1 = "Group1."
                labelModel1 = labelModelByLabelKey[labelKey1]

                result = dict()
                for labelCol in self.labelCols:
                    result[prefix0 + labelCol] = labelModel0[labelCol]
                    result[prefix1 + labelCol] = labelModel1[labelCol]

                for valueCol in self.valueCols:
                    # Pull out value column for each data group.  Previously sorted by match col to allow paired t-testing
                    # Skip any value pairs if non-numeric / None value
                    values0 = list()
                    values1 = list()
                    for dataItem0, dataItem1 in zip(data0, data1):
                        if dataItem0[valueCol] is not None and dataItem1[
                                valueCol] is not None:
                            values0.append(dataItem0[valueCol])
                            values1.append(dataItem1[valueCol])

                    for summaryFunction in SUMMARY_FUNCTIONS:
                        result[prefix0 + valueCol + "." +
                               summaryFunction.__name__] = summaryFunction(
                                   values0)
                        result[prefix1 + valueCol + "." +
                               summaryFunction.__name__] = summaryFunction(
                                   values1)

                    for compTest in COMPARISON_TESTS:
                        (t, p) = compTest(values0, values1)
                        if np.isnan(p):
                            p = None
                            # Use more generic expression for NaN / null value
                        result[compTest.__name__ + "." + valueCol] = p

                yield result

                prog.update()
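
SUMMARY_FUNCTIONS and COMPARISON_TESTS are module-level constants not shown here. Definitions consistent with how they are used above (summary callables applied to a value list, and comparison tests returning a (statistic, p-value) pair) might be:

import numpy as np
from scipy import stats

# Assumed constants, consistent with the calls above; the exact choices are guesses
SUMMARY_FUNCTIONS = [len, np.mean, np.std, np.median, np.min, np.max]
COMPARISON_TESTS = [stats.ttest_rel, stats.ttest_ind]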
Example No. 30
    "TBIL.last": [0,10],
    "K.last": [3,6],
    "Resp.last": [8,30],
    "Temp.last": [97,103],
    "Urine.last": [0,1000],
    "BP_Low_Diastolic.last": [40,80],
    "BP_High_Systolic.last": [80,160],
    "Glasgow.Coma.Scale.Score.last": [5,15],
}
dailyLabChangeRate = 0.1

baselineDNRRate = 0.1

durationRange = [1, 20]

prog = ProgressDots(total=nPatients)
for iPatient in range(nPatients):
    rowData = \
     {
      "patient_id":iPatient,
      "Birth.preTimeDays": random.randint(ageRange[0], ageRange[-1]),
      "income": random.randrange(incomeRange[0], incomeRange[-1], incomeStep),
      "Female.pre": (random.random() < femaleRate)+0,

      "AnyVasoactive.pre": (random.random() < vasoactiveRate)+0, # Add 0 to convert boolean to 0 or 1
      "AnyVentilator.pre": (random.random() < ventilatorRate)+0,
      "AnyCRRT.pre": (random.random() < crrtRate)+0,

      "Charlson.Cerebrovascular.pre": (random.random() < comorbidityRate)+0,
      "Charlson.CHF.pre": (random.random() < comorbidityRate)+0,
      "Charlson.COPD.pre": (random.random() < comorbidityRate)+0,