def queryItems(self, options, outputFile):
    """Query for all clinical item records that fulfill the options criteria
    and then send the results as tab-delimited output to the outputFile.
    """
    pauseSeconds = float(options.pauseSeconds)

    query = SQLQuery()
    query.addSelect("count(order_med_id_coded) as nOrders")
    query.addSelect("om.med_route, om.medication_id, om.med_description")
    query.addFrom("starr_datalake2018.order_med as om")
    if options.descriptionPrefix:
        query.addWhereOp("om.med_description", "like", options.descriptionPrefix + "%%")  # Add wildcard to enable prefix search
    if options.medRoutes:
        query.addWhereIn("om.med_route", options.medRoutes.split(","))
    query.addGroupBy("om.medication_id, om.med_description, om.med_route")
    query.addOrderBy("nOrders desc, om.med_description")

    formatter = TextResultsFormatter(outputFile)

    prog = ProgressDots()
    for row in DBUtil.execute(query, includeColumnNames=True, connFactory=self.connFactory):
        formatter.formatTuple(row)
        time.sleep(pauseSeconds)
        prog.update()
    prog.printStatus()
def main(nResamples=1000, sampleSize=200, positiveCount=180):
    confidenceInterval = 0.95

    # Build a binary (0/1) sample with the requested number of positives
    sample = list()
    for i in range(sampleSize):
        value = 0
        if i < positiveCount:
            value = 1
        sample.append(value)
    sample = np.array(sample)

    print("Sample Rate: ", calcStat(sample))

    rng = np.random.RandomState(nResamples)  # Seed with number for consistency

    bootstrapValues = list()
    prog = ProgressDots()
    for i in range(nResamples):
        indices = rng.random_integers(0, sampleSize-1, sampleSize)  # Get indices of resample items (with replacement)
        resample = sample[indices]
        bootstrapValues.append(calcStat(resample))
        prog.update()

    ciFractionLow = (1.0 - confidenceInterval) / 2
    ciFractionHigh = confidenceInterval + ciFractionLow

    bootstrapValues.sort()
    print("Sample Rate 95%CI Low", bootstrapValues[int(ciFractionLow*nResamples)])
    print("Sample Rate 95%CI High", bootstrapValues[int(ciFractionHigh*nResamples)])
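# The bootstrap above relies on a calcStat helper that reduces a resample to the statistic
# of interest. Its implementation is not shown in this excerpt; for a binary 0/1 sample like
# the one constructed here, a minimal sketch (an assumption, not the original definition)
# would simply be the positive rate:
import numpy as np

def calcStat(sample):
    """Statistic of interest for a bootstrap resample.
    Assumed here to be the positive rate of a 0/1 sample, i.e., its mean."""
    return np.mean(sample)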
def convertSourceItems(self, convOptions, conn=None):
    """Primary run function to process the contents of the raw source table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()

    extConn = conn is not None
    if not extConn:
        conn = self.connFactory.connection()
    try:
        # Next round for medications directly from order_med table not addressed in medmix
        # TODO (nodir) seems like an unrelated comment?
        category = self.categoryFromSourceItem(conn)
        for sourceItem in self.querySourceItems(convOptions):
            log.debug('sourceItem: {}'.format(sourceItem))
            self.convertSourceItem(category, sourceItem, conn=conn)
            progress.Update()
    finally:
        if not extConn:
            conn.close()
    progress.PrintStatus()
def parseLabResultsFile(labFile):
    log.info("Parse lab results file");
    prog = ProgressDots();
    labsByBaseNameByPatientId = dict();  # Dictionary of dictionaries of lists of result items
    for labResult in TabDictReader(labFile):
        if labResult["ord_num_value"] is not None and labResult["ord_num_value"] != NULL_STRING:
            patientId = int(labResult["pat_id"]);
            labBaseName = labResult["base_name"];
            resultValue = float(labResult["ord_num_value"]);
            resultTime = DBUtil.parseDateValue(labResult["result_time"]);

            if resultValue < LAB_SENTINEL_VALUE:    # Skip apparent placeholder values
                labResult["pat_id"] = labResult["patient_id"] = patientId;
                labResult["ord_num_value"] = resultValue;
                labResult["result_time"] = resultTime;

                if patientId not in labsByBaseNameByPatientId:
                    labsByBaseNameByPatientId[patientId] = dict();
                if labBaseName not in labsByBaseNameByPatientId[patientId]:
                    labsByBaseNameByPatientId[patientId][labBaseName] = list();
                labsByBaseNameByPatientId[patientId][labBaseName].append( labResult );
        prog.update();
    prog.printStatus();
    return labsByBaseNameByPatientId;
def queryItems(self, options, outputFile):
    """Query for all clinical item records that fulfill the options criteria
    and then send the results as tab-delimited output to the outputFile.
    """
    pauseSeconds = float(options.pauseSeconds)

    query = SQLQuery()
    query.addSelect("cic.description, ci.clinical_item_id, ci.name, ci.description")
    query.addFrom("clinical_item_category as cic")
    query.addFrom("clinical_item as ci")
    query.addWhere("cic.clinical_item_category_id = ci.clinical_item_category_id")
    if options.itemPrefix:
        query.addWhereOp("ci.description", "like", options.itemPrefix + "%%")  # Add wildcard to enable prefix search
    if options.categoryNames:
        query.addWhereIn("cic.description", options.categoryNames.split(","))
    query.addOrderBy("cic.description, ci.name, ci.description, ci.clinical_item_id")

    formatter = TextResultsFormatter(outputFile)

    prog = ProgressDots()
    for row in DBUtil.execute(query, includeColumnNames=True, connFactory=self.connFactory):
        formatter.formatTuple(row)
        time.sleep(pauseSeconds)
        prog.update()
    prog.printStatus()
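# For orientation only: the builder calls above are assumed to assemble a query roughly like
# the string below. The exact rendering and parameter placeholders depend on the SQLQuery
# implementation and the number of categoryNames supplied; this is an illustrative sketch.
assumedSQL = """
select cic.description, ci.clinical_item_id, ci.name, ci.description
from clinical_item_category as cic, clinical_item as ci
where cic.clinical_item_category_id = ci.clinical_item_category_id
  and ci.description like %s
  and cic.description in (%s, %s)
order by cic.description, ci.name, ci.description, ci.clinical_item_id
"""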
def convertSourceItems(self, convOptions):
    """Primary run function to process the contents of the order_med table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose order_time_jittered is on or after that date.
    endDate - If provided, only return items whose order_time_jittered is before that date.
    """
    log.info("Conversion for items dated {} to {}".format(convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()
    conn = self.connFactory.connection()
    try:
        # Load up the medication mapping table to facilitate subsequent conversions
        rxcuiDataByMedId = self.loadRXCUIData()

        # Next round for medications directly from order_med table not addressed in medmix
        for sourceItem in self.querySourceItems(rxcuiDataByMedId, convOptions, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()
    finally:
        conn.close()
    progress.PrintStatus()
def analyzePatientItems(self, analysisOptions):
    """Primary run function to analyze patient clinical item data and record
    updated stats to the respective database tables.

    Does the analysis only for records pertaining to the given patient IDs
    (provides a way to limit the extent of analysis depending on params).

    Will also record analyze_date timestamp on any records analyzed,
    so that analysis will not be repeated if called again on the same records.
    """
    progress = ProgressDots();
    conn = self.connFactory.connection();
    try:
        # Preload lookup data to facilitate rapid checks and filters later
        linkedItemIdsByBaseId = self.dataManager.loadLinkedItemIdsByBaseId(conn=conn);

        # Keep an in memory buffer of the updates to be done so can stall and submit them
        #   to the database in batch to minimize inefficient DB hits
        updateBuffer = self.makeUpdateBuffer();

        log.info("Main patient item query...")
        for iPatient, patientItemList in enumerate(self.queryPatientItemsPerPatient(analysisOptions, progress=progress, conn=conn)):
            log.debug("Calculate associations for Patient %d's %d patient items. %d associations in buffer." % (iPatient, len(patientItemList), updateBuffer["nAssociations"]));
            self.updateItemAssociationsBuffer(patientItemList, updateBuffer, analysisOptions, linkedItemIdsByBaseId, progress=progress);
            if self.readyForIntervalCommit(iPatient, updateBuffer, analysisOptions):
                log.info("Commit after %s patients" % (iPatient+1));
                self.persistUpdateBuffer(updateBuffer, linkedItemIdsByBaseId, analysisOptions, iPatient, conn=conn);  # Periodically commit update buffer
            else:
                # If not committing, still send a quick arbitrary query to DB,
                #   otherwise connection may get recycled because DB thinks timeout with no interaction
                DBUtil.execute("select 1+1", conn=conn);

        log.info("Final commit / persist");
        self.persistUpdateBuffer(updateBuffer, linkedItemIdsByBaseId, analysisOptions, -1, conn=conn);  # Final update buffer commit. Don't use iPatient here, as may collide if interval commit happened to land on last patient
    finally:
        conn.close();
def addLabFeatures(labsByBaseNameByPatientId, patientById, colNames, indexItemBaseName, labBaseNames, labPreTime, labPostTime):
    log.info("Sort lab results by result time for each patient and find items within specified time period to aggregate");
    prog = ProgressDots();
    for iPatient, (patientId, labsByBaseName) in enumerate(labsByBaseNameByPatientId.iteritems()):
        # Look for the first result of the index item (ferritin)
        indexItem = None;
        if indexItemBaseName in labsByBaseName:
            for labResult in labsByBaseName[indexItemBaseName]:
                if indexItem is None or labResult["result_time"] < indexItem["result_time"]:
                    indexItem = labResult;

        if indexItem is not None:   # Skip this patient if no index item found, should not be possible since pre-screened for relevant patients
            indexTime = indexItem["result_time"];

            patient = patientById[patientId];
            patient["index_time"] = indexTime;

            preTimeLimit = indexTime + labPreTime;
            postTimeLimit = indexTime + labPostTime;

            # Init values for each lab of interest to an empty list
            for labBaseName in labBaseNames:
                # Default to null for all values
                patient["%s.min" % labBaseName] = None;
                patient["%s.max" % labBaseName] = None;
                patient["%s.median" % labBaseName] = None;
                patient["%s.mean" % labBaseName] = None;
                patient["%s.std" % labBaseName] = None;
                patient["%s.first" % labBaseName] = None;
                patient["%s.last" % labBaseName] = None;
                patient["%s.proximate" % labBaseName] = None;

                proximateValue = None;
                if labBaseName in labsByBaseName:   # Not all patients will have all labs checked
                    proximateItem = None;   # Item closest to the index item in time
                    valueList = list();
                    for labResult in labsByBaseName[labBaseName]:
                        resultTime = labResult["result_time"];
                        if preTimeLimit <= resultTime and resultTime < postTimeLimit:
                            # Occurs within time frame of interest, so record this value
                            valueList.append(labResult["ord_num_value"]);
                        if proximateItem is None or (abs(resultTime-indexTime) < abs(proximateItem["result_time"]-indexTime)):
                            proximateItem = labResult;
                    proximateValue = proximateItem["ord_num_value"];

                    if len(valueList) > 0:
                        patient["%s.min" % labBaseName] = np.min(valueList);
                        patient["%s.max" % labBaseName] = np.max(valueList);
                        patient["%s.median" % labBaseName] = np.median(valueList);
                        patient["%s.mean" % labBaseName] = np.mean(valueList);
                        patient["%s.std" % labBaseName] = np.std(valueList);
                        patient["%s.first" % labBaseName] = valueList[0];   # Assumes previously sorted
                        patient["%s.last" % labBaseName] = valueList[-1];   #   by result_time
                        patient["%s.proximate" % labBaseName] = proximateValue;

        prog.update();

    colNames.extend(colsFromLabBaseNames(labBaseNames));
    prog.printStatus();
def generatePatientItemsForCompositeId(self, clinicalItemIds, compositeId, conn=None):
    """Create patient_item records for the composite to match the given clinical item ID patient items.
    """
    extConn = True
    if conn is None:
        conn = self.connFactory.connection()
        extConn = False
    try:
        # Record linking information
        for componentId in clinicalItemIds:
            linkModel = RowItemModel()
            linkModel["clinical_item_id"] = compositeId
            linkModel["linked_item_id"] = componentId

            insertQuery = DBUtil.buildInsertQuery("clinical_item_link", linkModel.keys())
            insertParams = linkModel.values()
            DBUtil.execute(insertQuery, insertParams, conn=conn)

        # Extract back link information, which will also flatten out any potential inherited links
        linkedItemIdsByBaseId = self.loadLinkedItemIdsByBaseId(conn=conn)
        linkedItemIds = linkedItemIdsByBaseId[compositeId]

        # Create patient_item records for the composite clinical item to overlap existing component ones
        # First query for the existing component records
        query = SQLQuery()
        query.addSelect("*")
        query.addFrom("patient_item")
        query.addWhereIn("clinical_item_id", linkedItemIds)
        results = DBUtil.execute(query, includeColumnNames=True, conn=conn)
        patientItems = modelListFromTable(results)

        # Patch component records to instead become composite item records, then insert back into database
        progress = ProgressDots(total=len(patientItems))
        for patientItem in patientItems:
            del patientItem["patient_item_id"]
            patientItem["clinical_item_id"] = compositeId
            patientItem["analyze_date"] = None

            insertQuery = DBUtil.buildInsertQuery("patient_item", patientItem.keys())
            insertParams = patientItem.values()
            try:
                # Optimistic insert of a new unique item
                DBUtil.execute(insertQuery, insertParams, conn=conn)
            except conn.IntegrityError, err:
                # If it turns out to be a duplicate, okay, just note it and continue to insert whatever else is possible
                log.info(err)
            progress.Update()
        # progress.PrintStatus();
    finally:
        if not extConn:
            conn.close()
def buildModel(self, corpusBOWGenerator, numTopics):
    """Build topic model from corpus (interpreted as a generator over contents).
    Given the bag-of-words corpus, build a docCountByWordId count dictionary
    to facilitate subsequent Term Frequency * Inverse Document Frequency calculations.
    In the clinical context, document = patient.
    Return (model, docCountByWordId);
    """
    # Load dictionary to translate item IDs to descriptions
    itemsById = DBUtil.loadTableAsDict("clinical_item")
    id2word = dict()
    # Models expect a pair for every possible item ID, and judge vocabulary size by the length of this
    #   dictionary rather than the maximum ID value. That means have to populate all of the empty ones as well.
    maxId = max(itemsById.keys())
    for itemId in range(maxId + 1):
        description = str(itemId)   # Default to just the same as the ID string
        if itemId in itemsById:
            description = itemsById[itemId]["description"]
        id2word[itemId] = description

    # Stream in progressive updates from corpus generator so don't have to load all into memory.
    # Do a batch of many at a time, otherwise very slow to increment one at a time.
    docBuffer = list()
    prog = ProgressDots()
    self.model = None
    self.docCountByWordId = {None: 0}   # Use None key to represent count of all documents
    for i, document in enumerate(corpusBOWGenerator):
        for (wordId, wordCount) in document:
            # Assuming uniqueness of wordId keys for each document
            if wordId not in self.docCountByWordId:
                self.docCountByWordId[wordId] = 0
            self.docCountByWordId[wordId] += 1
        self.docCountByWordId[None] += 1

        docBuffer.append(document)
        if i % BUFFER_UPDATE_SIZE == (BUFFER_UPDATE_SIZE - 1):
            # Update model with current buffer of documents
            self.model = self.updateModel(self.model, docBuffer, id2word, numTopics)
            docBuffer = list()  # Discard committed buffer
        prog.update()

    self.model = self.updateModel(self.model, docBuffer, id2word, numTopics)    # Last update for any remaining documents in buffer
    docBuffer = list()  # Discard committed buffer
    # prog.printStatus();
    return (self.model, self.docCountByWordId)
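# For reference, each document yielded by corpusBOWGenerator is assumed to be a bag-of-words
# list of (wordId, wordCount) pairs, one document per patient, which is the format the loop
# above iterates over. A tiny illustrative corpus (made-up IDs and counts) might look like:
exampleCorpus = [
    [(101, 3), (205, 1), (310, 2)],     # Patient A: item 101 occurred 3 times, etc.
    [(101, 1), (400, 5)],               # Patient B
]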
def main(self, argv):
    """Main method, callable from command line"""
    usageStr = "usage: %prog -p <pid> -c <commandStr>\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-p", "--pid", dest="pid",
                      help="Process ID to monitor. As soon as it is no longer found by Unix signal (os.kill(pid,0)), will proceed to execute the given command")
    parser.add_option("-i", "--interval", dest="interval", default="1",
                      help="How many seconds to wait before checking if the PID is still active. Default to 1 second.")
    parser.add_option("-c", "--commandStr", dest="commandStr",
                      help="Command string to execute after the specified PID is no longer found. Will just pass whole string to subprocess")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()

    ###########---From medinfo/common/support/awaitProcess.py---#############
    if options.pid and options.commandStr:
        pid = int(options.pid)
        interval = float(options.interval)

        prog = ProgressDots(60, 1, "intervals")
        while pidExists(pid):
            time.sleep(interval)
            prog.update()
        # prog.printStatus();

        print >> sys.stderr, "Executing: ", options.commandStr
        process = subprocess.Popen(options.commandStr)
        print >> sys.stderr, "Started process: ", process.pid
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    print >> sys.stderr, ("%.3f seconds to complete" % timer)
    ###---END awaitProcess method---###

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def analyzePatientItems(self, patientIds, itemIdSequence, virtualItemId):
    """Primary run function to analyze patient clinical item data and record
    updated stats to the respective database tables.

    Does the analysis only for records pertaining to the given patient IDs
    (provides a way to limit the extent of analysis depending on params).

    Note that this does NOT record an analyze_date timestamp on any records analyzed,
    as that would collide with AssociationAnalysis primary timestamping; thus it is the
    caller's responsibility to be careful not to repeat this analysis redundantly
    and generate duplicated statistics.
    """
    progress = ProgressDots()
    conn = self.connFactory.connection()
    try:
        # Preload lookup data to facilitate rapid checks and filters later
        linkedItemIdsByBaseId = self.dataManager.loadLinkedItemIdsByBaseId(conn=conn)
        self.verifyVirtualItemLinked(itemIdSequence, virtualItemId, linkedItemIdsByBaseId, conn=conn)

        # Keep an in memory buffer of the updates to be done so can stall and submit them
        #   to the database in batch to minimize inefficient DB hits
        updateBuffer = dict()

        log.info("Main patient item query...")
        analysisOptions = AnalysisOptions()
        analysisOptions.patientIds = patientIds
        for iPatient, patientItemList in enumerate(self.queryPatientItemsPerPatient(analysisOptions, progress=progress, conn=conn)):
            log.debug("Calculate associations for Patient %d's %d patient items" % (iPatient, len(patientItemList)))
            self.updateItemAssociationsBuffer(itemIdSequence, virtualItemId, patientItemList, updateBuffer, linkedItemIdsByBaseId, progress=progress)
            # Periodically send a quick arbitrary query to DB, otherwise connection may get recycled because DB thinks timeout with no interaction
            DBUtil.execute("select 1+1", conn=conn)

        log.info("Final commit")
        self.commitUpdateBuffer(updateBuffer, linkedItemIdsByBaseId, conn=conn)   # Final update buffer commit
    finally:
        conn.close()
def __call__(self, analysisQuery, conn=None):
    extConn = True;
    if conn is None:
        conn = self.connFactory.connection();
        extConn = False;
    try:
        # Preload some lookup data to facilitate subsequent checks
        categoryIdByItemId = dict();
        lookupTable = DBUtil.execute("select clinical_item_id, clinical_item_category_id from clinical_item", conn=conn);
        for (clinicalItemId, categoryId) in lookupTable:
            categoryIdByItemId[clinicalItemId] = categoryId;

        # Recommender to test with
        recommender = analysisQuery.recommender;

        # Start building basic recommendation query to use for testing
        recQuery = analysisQuery.baseRecQuery;

        # Start building results data
        resultsTable = list();

        progress = ProgressDots(50, 1, "Item Recommendations");

        # Query for all of the order / item data for the test patients. Load one patient's data at a time
        for (patientId, clinicalItemIdList) in self.queryPatientClinicalItemData(analysisQuery, conn=conn):
            # Parse through the patient's item list and run serial recommendation queries
            #   to find each item's accumulated recommendation rank
            serialRecDataGen = \
                self.reviewSerialRecommendations(patientId, clinicalItemIdList, analysisQuery, recQuery,
                                                 recommender, categoryIdByItemId, progress=progress, conn=conn);
            for (clinicalItemId, iItem, iRecItem, recRank, recScore) in serialRecDataGen:
                resultsRow = (patientId, clinicalItemId, iItem, iRecItem, recRank, recScore);
                resultsTable.append(resultsRow);

        # progress.PrintStatus();
        #print resultsTable
        return resultsTable;
    finally:
        if not extConn:
            conn.close();
def convertSourceItems(self, patientIds=None):
    """Primary run function to process the contents of the starr_datalake2018.demographic table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies to avoid repeating conversion.

    patientIds - If provided, only process items for patient IDs matching those provided
    """
    log.info("Conversion for patients starting with: %s, %s total" % (patientIds[:5], len(patientIds)))
    progress = ProgressDots()

    with self.connFactory.connection() as conn:
        category_model = self.categoryFromSourceItem(conn)  # Only 1 category - no need to have it in the loop
        for sourceItem in self.querySourceItems(patientIds, progress):
            self.convertSourceItem(category_model, sourceItem, conn)
def convertSourceItems(self, patientIds=None):
    """Primary run function to process the contents of the stride_patient table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies to avoid repeating conversion.

    patientIds - If provided, only process items for patient IDs matching those provided
    """
    log.info("Conversion for patients starting with: %s, %s total" % (patientIds[:5], len(patientIds)))
    progress = ProgressDots()

    with self.connFactory.connection() as conn:
        for sourceItem in self.querySourceItems(patientIds, progress=progress):
            self.convertSourceItem(sourceItem, conn=conn)
def convertSourceItems(self, userSIDs=None, limit=None, offset=None):
    """Primary run function to process the contents of the source table
    and convert them into normalized data table entries.
    """
    log.info("Conversion for patients: %s" % userSIDs);
    progress = ProgressDots();
    conn = self.connFactory.connection();
    try:
        for i, sourceItem in enumerate(self.querySourceItems(userSIDs, limit, offset, progress=progress, conn=conn)):
            self.convertSourceItem(sourceItem, conn=conn);
            progress.Update();

        # Consolidate accumulated metric description lines into single entries for the metric table
        self.updateMetricDescriptionLines();
    finally:
        conn.close();
def convertSourceItems(self, patientIds=None):
    """Primary run function to process the contents of the stride_patient table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies to avoid repeating conversion.

    patientIds - If provided, only process items for patient IDs matching those provided
    """
    log.info("Conversion for patients: %s" % patientIds)
    progress = ProgressDots()
    conn = self.connFactory.connection()
    try:
        for sourceItem in self.querySourceItems(patientIds, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
    finally:
        conn.close()
def convertSourceItems(self, startDate=None, endDate=None):
    """Primary run function to process the contents of the source table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies to avoid repeating conversion.

    startDate - If provided, only return items whose noted_date is on or after that date.
    endDate - If provided, only return items whose noted_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (startDate, endDate));
    progress = ProgressDots();
    conn = self.connFactory.connection();
    try:
        for sourceItem in self.querySourceItems(startDate, endDate, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn);
    finally:
        conn.close();
    progress.PrintStatus();
def main(argv):
    conversionProcessor = STRIDEOrderResultsConversion();

    conn = DBUtil.connection();
    try:
        # Pull out list of result names to look for that are not already in the calculated stats table
        nameTable = DBUtil.execute("select name from sim_result except select base_name from order_result_stat", conn=conn);
        prog = ProgressDots(big=1, small=1, total=len(nameTable));
        for row in nameTable:
            baseName = row[0];
            print("Calculating Stats for %s" % baseName, file=sys.stderr);
            statModel = conversionProcessor.calculateResultStats(baseName, conn=conn);
            DBUtil.insertRow("order_result_stat", statModel, conn=conn);
            prog.update();
        prog.printStatus();
        conn.commit();
    finally:
        conn.close();
def parseClinicalItemFile(itemFile, patientIdCol="patient_id", timeCol="item_date"):
    prog = ProgressDots();
    itemTimesByPatientId = dict();
    for itemData in TabDictReader(itemFile):
        patientId = int(itemData[patientIdCol]);
        itemTime = DBUtil.parseDateValue(itemData[timeCol]);

        itemData[patientIdCol] = patientId;
        itemData[timeCol] = itemTime;

        if patientId not in itemTimesByPatientId:
            itemTimesByPatientId[patientId] = list();
        itemTimesByPatientId[patientId].append( itemTime );

        prog.update();
    prog.printStatus();
    return itemTimesByPatientId;
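# The reader above expects a tab-delimited file with a header row containing at least the
# patient ID and date columns (defaults "patient_id" and "item_date"). The contents below are
# made up for illustration; the date format just needs to be something DBUtil.parseDateValue accepts.
exampleItemFileContents = (
    "patient_id\titem_date\n"
    "123\t2011-01-05 08:30:00\n"
    "123\t2011-02-17 14:00:00\n"
    "456\t2012-06-01 09:15:00\n"
)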
def __call__(self, inputFiles):
    """Return generator over dictionary objects representing the concatenated
    contents of the input files after adding and accounting for argv parameter columns.
    """
    # Consolidate a master set of all column headers to use
    self.colNames = list()  # Keep track of consistent order found
    colSet = set()          # Keep track of unique items

    # Pull out any header comments that may represent an argv list to parse
    # Pull out header row with column labels for each input file
    argvDicts = list()  # Dictionary for each input file keyed by argv parameter name with respective value
    readers = list()    # TabDictReader from which header columns can be accessed as fieldnames
    for inputFile in inputFiles:
        reader = TabDictReader(inputFile)
        readers.append(reader)

        for col in reader.fieldnames:
            if col not in colSet:
                colSet.add(col)
                self.colNames.append(col)

        argvDict = self.extract_argvDict(reader.commentLines)   # Must be called after reader.fieldnames so initial text parsing will start
        argvDicts.append(argvDict)
        for col in argvDict.iterkeys():
            if col not in colSet:
                colSet.add(col)
                self.colNames.append(col)

    prog = ProgressDots(50, 1, "Files")

    # Now generate each file in succession, but "outer-joined" to include the master column header list
    for argvDict, reader in zip(argvDicts, readers):
        for resultDict in reader:
            resultDict.update(argvDict)
            for col in self.colNames:
                if col not in resultDict:
                    resultDict[col] = None
            yield resultDict
        prog.update()
def convertSourceItems(self, convOptions):
    """Primary run function to process the contents of the stride_order_med table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()
    conn = self.connFactory.connection()
    try:
        # Load up the medication mapping table to facilitate subsequent conversions
        rxcuiDataByMedId = self.loadRXCUIData(conn=conn)

        # Keep track of which order meds have already been converted based on mixture components
        #   (don't repeat for the aggregate order then).
        # Can be a lot to store in local memory for large conversions, so may need to batch smaller sub-processes.
        convertedOrderMedIds = set()

        # First round for medication combinations that must be extracted from order_medmixinfo table
        for sourceItem in self.queryMixSourceItems(rxcuiDataByMedId, convOptions, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
            convertedOrderMedIds.add(sourceItem["order_med_id"])
            progress.Update()

        # Next round for medications directly from order_med table not addressed in medmix
        for sourceItem in self.querySourceItems(rxcuiDataByMedId, convOptions, progress=progress, conn=conn):
            if sourceItem["order_med_id"] not in convertedOrderMedIds:  # Don't repeat conversion if mixture components already addressed
                self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()
    finally:
        conn.close()
    progress.PrintStatus()
def convertSourceItems(self, convOptions):
    """Primary run function to process the contents of the raw source table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate));
    progress = ProgressDots();
    conn = self.connFactory.connection();
    try:
        # Next round for medications directly from order_med table not addressed in medmix
        for sourceItem in self.querySourceItems(convOptions, progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn);
            progress.Update();
    finally:
        conn.close();
    progress.PrintStatus();
def main_quickTest(argv):
    modelFilename = argv[1]
    modeler = TopicModel()

    timer = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    timer = time.time() - timer
    log.info("%.2f seconds to load", timer)

    timer = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(model, 100)
    timer = time.time() - timer
    log.info("%.2f seconds to generate weights", timer)

    for i in xrange(3):
        prog = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                prog.update()
        prog.printStatus()
def execute(query, parameters=None, includeColumnNames=False, incTypeCodes=False, formatter=None,
            conn=None, connFactory=None, autoCommit=True):
    """Execute a single SQL query / command against the database.

    If the description attribute is not None, this implies this was a select statement
    that produced a result set which will be returned by the fetchall() method.

    If the description is null, then at least return the rowcount affected by the query.
    This may be -1 or None still if it is a non-row affecting command (e.g. create / drop).

    If includeColumnNames is true and the query yields a result set, then one row (list)
    will be added to the beginning which contains the names of each column
    as extracted from the cursor.description.

    If incTypeCodes is true and the query yields a result set, a row (list) will be added
    to the beginning (but after column names if those are included as well), which contains
    the numerical type codes of each column as extracted from the cursor.description.

    This method is probably not terribly efficient and should only be used for prototype
    testing and short command line functions. For retrieving data to send to stdout or some
    other stream, add the formatter parameter as an instance of a ResultFormatter object
    to pipe the data through one fetch at a time. In that case, the full results
    (which are presumably large) will NOT be returned by the method.

    If the query object is actually a SQLQuery object, then will use the SQLQuery.getParams()
    as the params, and str(SQLQuery) as the query string.

    If autoCommit is True, will autoCommit. The function will also autoCommit
    if an external connection is NOT supplied.
    """
    # Look for an explicitly specified external connection
    extConn = conn is not None
    if conn is None:
        # If no specific connection object provided, look for a connection factory to produce one
        if connFactory is not None:
            conn = connFactory.connection()
        else:
            # No connection or factory specified, just fall back on default connection then
            conn = connection()

    cur = conn.cursor()

    if isinstance(query, SQLQuery):
        if parameters is None:
            parameters = tuple(query.getParams())
        else:
            parameters = tuple(parameters)
        query = str(query)
    elif parameters is None:
        parameters = ()

    #log.debug(parameterizeQueryString(query,parameters));

    returnValue = None
    try:
        timer = time.time();
        try:
            cur.execute( query, parameters )
        except Exception, err:
            log.error(err);
            #log.error(parameterizeQueryString(query,parameters));
            if (not extConn) or autoCommit:
                conn.rollback();
            raise;
        timer = time.time() - timer;
        log.debug("Query Time: (%1.3f sec)" % timer );

        if cur.description != None:
            returnValue = []

            colNames = None;
            if includeColumnNames:
                colNames = columnNamesFromCursor(cur);
                returnValue.append(colNames)

            if incTypeCodes:
                typeCodes = typeCodesFromCursor(cur);
                returnValue.append(typeCodes);

            if formatter != None:
                # An output formatter was specified, pipe the data out one row at a time
                if includeColumnNames:
                    formatter.formatTuple(colNames)

                progress = ProgressDots();
                row = cur.fetchone()
                while row != None:
                    formatter.formatTuple(row)
                    row = cur.fetchone()
                    progress.Update();
                log.info("%d Rows Completed", progress.GetCounts());

                returnValue = cur.rowcount
            else:
                # No formatter specified, just return the entire result set
                dataTable = list(cur.fetchall());
                for i, row in enumerate(dataTable):
                    dataTable[i] = list(row);
                returnValue.extend(dataTable);
        else:
            returnValue = cur.rowcount
        if (not extConn) or autoCommit:
            conn.commit()
    finally:
        if not extConn:
            conn.close()    # Assumed closing of the try block, not shown in the source excerpt: only close connections created locally
    return returnValue
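# A hedged usage sketch of execute(), based on the docstring above. The import paths, table,
# and column names are assumptions for illustration only and may differ in the actual package layout.
import sys
from medinfo.db import DBUtil
from medinfo.db.Model import SQLQuery
from medinfo.db.ResultsFormatter import TextResultsFormatter

query = SQLQuery()
query.addSelect("clinical_item_id, name, description")
query.addFrom("clinical_item")
query.addWhereOp("name", "like", "LAB%")

# Small result set: get it back as a list of rows, with a header row prepended
results = DBUtil.execute(query, includeColumnNames=True)

# Large result set: stream rows to stdout as tab-delimited text instead of returning them
DBUtil.execute(query, formatter=TextResultsFormatter(sys.stdout))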
    'LABNTBNP', 'LABOSM', 'LABPALB', 'LABPCCG4O', 'LABPCCR', 'LABPCTNI',
    'LABPHOS', 'LABPLTS', 'LABPROCT', 'LABPT', 'LABPTEG', 'LABPTT',
    'LABRESP', 'LABRESPG', 'LABRETIC', 'LABSPLAC', 'LABSTLCX', 'LABSTOBGD',
    'LABTNI', 'LABTRFS', 'LABTRIG', 'LABTSH', 'LABUCR', 'LABUOSM',
    'LABUA', 'LABUAPRN', 'LABUPREG', 'LABURIC', 'LABURNA', 'LABURNC', 'LABUSPG'
]

labs_to_test = ["LABLACWB"]
# [#"LABMGN, #"LABPTT", #"LABPHOS", "LABTNI", #"LABLACWB",
#  #"LABA1C", #"LABHEPAR", #"LABPLTS", "LABLAC", "LABLIPS",
#  "LABTSH", "LABHCTX", "LABLDH", "LABK", "LABNTBNP",
#  "LABCRP", "LABCK", #"LABFER",
#  "LABUSPG"]

baseArgv = \
    ["python", "LabChangePredictionPipeline.py"]

prog = ProgressDots(1, 1, "Processes", total=len(labs_to_test))
for lab_name in labs_to_test:
    key = "%.3d.%s" % (prog.getCounts(), lab_name)
    key = key.replace("/", "..")    # Don't want to use directory separator in temp log file name

    argv = list(baseArgv)
    argv.append(lab_name)

    log.info("Starting: " + str.join(" ", argv))
    logFile = stdOpen(LOG_FILE_TEMPLATE % key, "w")
    p = subprocess.Popen(argv, stderr=logFile)  # Blocking sub-process call. Can just run multiple instances to parallelize, or consider subprocess.Popen objects instead
def insertFile(sourceFile, tableName, columnNames=None, delim=None, idFile=None, skipErrors=False,
               dateColFormats=None, escapeStrings=False, estInput=None, connFactory=None):
    """Insert the contents of a whitespace-delimited text file into the database.

    For PostgreSQL specifically, consider the alternative direct COPY command that can run ~10x faster. E.g.,
        gzip -d -c TTeam_2014.tsv.gz | psql -U jonc101 -c "COPY tteamx ( pat_deid, enc_deid, relation, prov_id, prov_name, start_date, end_date ) FROM STDIN WITH (FORMAT csv, DELIMITER E'\t', HEADER, NULL 'None');" resident-access-log-2017

    Inserts the contents of the <sourceFile> into the database under the <tableName>.
    One line is expected in the <sourceFile> per row in the database, with each item
    delimited by the <delim> character. These items will be inserted under the
    respective order of the given list of columnNames.

    Use the built-in csv module for parsing out lines and managing quotes, etc.
    If delimiter is not specified (None), then default to tab-delimited.

    If idFile is provided, then will try to run SQL from identityQuery method after each insert,
    and write out the contents, one per line, to the idFile.
    Will bypass the above step if an insert column is found with the expected default ID column name ("tableName_id").

    If dateColFormats is provided, expect a dictionary keyed by the names of columns that should be
    interpreted as date strings, with values equal to the Python date format string to parse them by.
    If a format string is not provided, a series of standard date format strings will be attempted
    (but this is inefficient for repeated date text parsing and error handling).

    Returns the total number of rows successfully inserted.
    """
    if columnNames is not None and len(columnNames) < 1:
        columnNames = None;     # If empty columnNames list, then reset to null and look for it in first line of data

    reader = TabDictReader(sourceFile, fieldnames=columnNames, delimiter=delim);
    columnNames = reader.fieldnames;

    idCol = defaultIDColumn(tableName);
    iIdCol = None;  # Index of manually specified ID column. May be null
    for iCol, colName in enumerate(columnNames):
        if colName == idCol:
            iIdCol = iCol;

    if dateColFormats is not None:
        # Ensure column keys are normalized
        dateCols = dateColFormats.keys();
        for dateCol in dateCols:
            normalCol = normalizeColName(dateCol);
            dateColFormats[normalCol] = dateColFormats[dateCol];

    conn = None;
    if connFactory is not None:
        conn = connFactory.connection();
    else:
        conn = connection()
    cur = conn.cursor()

    try:
        # Prepare the SQL Statement
        sqlParts = []
        sqlParts.append("insert into")
        sqlParts.append( tableName )
        sqlParts.append("(")
        sqlParts.append( str.join(",", columnNames) );
        sqlParts.append(")")
        sqlParts.append("values")
        sqlParts.append("(")
        for i in range(len(columnNames)):
            sqlParts.append( Env.SQL_PLACEHOLDER )  # Parameter placeholder, depends on DB-API
            sqlParts.append(",")
        sqlParts.pop();     # Remove extra end comma
        sqlParts.append(")")
        sql = str.join(" ", sqlParts)
        log.debug(sql)

        # Loop through file and execute insert statement every time enough delimited parameters are found
        nInserts = 0
        nCols = len(columnNames)
        params = list();
        progress = ProgressDots(total=estInput);
        for iLine, rowModel in enumerate(reader):
            # Parse out data values from strings
            for iCol, colName in enumerate(columnNames):
                value = parseValue(rowModel[colName], colName, dateColFormats, escapeStrings);
                params.append(value);

            log.debug(params)
            try:
                cur.execute(sql, tuple(params))
                nInserts += cur.rowcount

                if idFile != None:
                    rowId = None;
                    if iIdCol is not None:  # Look for manually assigned ID value first
                        rowId = params[iIdCol];
                    else:
                        cur.execute(identityQuery(tableName));
                        rowId = cur.fetchone()[0];
                    print >> idFile, rowId;

                # Need to "auto-commit" after each command,
                #   otherwise a skipped error will rollback
                #   any previous commands as well
                if skipErrors:
                    conn.commit()

                progress.Update()
            except Exception, err:
                log.info(sql);
                log.info(tuple(params))
                conn.rollback();    # Reset any changes since the last commit
                if skipErrors:
                    log.warning("Error Executing in Script: " + sql)
                    log.warning(err)
                else:
                    raise;
            params = list();

        conn.commit()
        return nInserts
    finally:
        conn.close()    # Assumed closing of the try block, not shown in the source excerpt
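# A hypothetical call to insertFile(), following the docstring above. The file name, table name,
# and date column are made-up assumptions for illustration.
with open("patient_items.tsv") as sourceFile, open("inserted_ids.txt", "w") as idFile:
    nInserted = insertFile(
        sourceFile,
        "patient_item",
        columnNames=None,                                   # Take column names from the file's header line
        delim="\t",
        idFile=idFile,                                      # Record generated row IDs, one per line
        skipErrors=True,                                    # Commit per row and keep going past bad rows
        dateColFormats={"item_date": "%Y-%m-%d %H:%M:%S"},  # Parse this column as a datetime
    )
    print("Inserted %d rows" % nInserted)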
def updateFromFile(sourceFile, tableName, columnNames=None, nIdCols=1, delim=None, skipErrors=False, connFactory=None):
    """Update the database with the contents of a whitespace-delimited text file.

    Updates the contents of the <tableName> with the data from the <sourceFile>.
    One line is expected in the <sourceFile> per row in the database, with each item
    delimited by the <delim> character (specify None for any whitespace).
    These items will be inserted under the respective order of the given list of <columnNames>.
    If the columnNames parameter is not provided, assume the first line of the <sourceFile>
    contains the column names.

    To know which rows to update, assume the FIRST column listed in <columnNames> is the ID column
    to identify rows by. In that case, the data value there from the <sourceFile> will not be used
    to update the row, but will instead be used to identify the row to update the rest of the data by.
    If more than one column is necessary to identify a row (composite key), indicate how many
    of the first columns in <columnNames> should be used with <nIdCols>.
    Note that these key ID values must not be None / null. The query looks for rows
    where columnname = value, and the = operator always returns false when the value is null.

    Returns the total number of rows successfully updated.
    """
    if columnNames is None or len(columnNames) < 1:
        headerLine = sourceFile.readline();
        columnNames = headerLine.split(delim);

    conn = None;
    if connFactory is not None:
        conn = connFactory.connection();
    else:
        conn = connection()
    cur = conn.cursor()

    nCols = len(columnNames);

    try:
        # Prepare the SQL Statement
        sql = [];
        sql.append("update");
        sql.append( tableName );
        sql.append("set");

        # Data Columns
        for i in xrange(nIdCols, nCols):
            sql.append(columnNames[i]);
            sql.append("=");
            sql.append(Env.SQL_PLACEHOLDER);
            sql.append(",");
        sql.pop();  # Remove extra comma at end

        # ID Columns
        sql.append("where")
        for i in xrange(nIdCols):
            sql.append(columnNames[i]);
            sql.append("=");
            sql.append(Env.SQL_PLACEHOLDER);
            sql.append("and");
        sql.pop();  # Remove extra "and" at end

        sql = str.join(" ", sql);
        log.debug(sql)

        # Loop through file and execute update statement for every line
        progress = ProgressDots()
        for iLine, line in enumerate(sourceFile):
            if not line.startswith(COMMENT_TAG):
                try:
                    line = line[:-1];   # Strip the newline character
                    params = line.split(delim);

                    # Special handling for null / None string
                    for iParam in xrange(len(params)):
                        if params[iParam] == "" or params[iParam] == NULL_STRING:   # Treat blank strings as NULL
                            params[iParam] = None;

                    # Reposition ID columns to end of parameter list
                    idParams = params[:nIdCols];
                    dataParams = params[nIdCols:];
                    paramTuple = dataParams;
                    paramTuple.extend( idParams );
                    paramTuple = tuple(paramTuple);

                    cur.execute(sql, paramTuple);

                    # Need to "auto-commit" after each command,
                    #   otherwise a skipped error will rollback
                    #   any previous commands as well
                    if skipErrors:
                        conn.commit()

                    progress.Update()
                except Exception, err:
                    conn.rollback();    # Reset changes and connection state
                    log.critical(sql);
                    log.critical(paramTuple);
                    log.warning("Error Executing in Script: %s", parameterizeQueryString(sql, paramTuple));
                    if skipErrors:
                        log.warning(err)
                    else:
                        raise err

        conn.commit()
        return progress.GetCounts();
    finally:
        conn.close();   # Assumed closing of the try block, not shown in the source excerpt
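# A hypothetical call to updateFromFile(), following the docstring above. The file name, table,
# and column names are made-up assumptions; the first column identifies the row, the rest are updated.
with open("item_status_updates.tsv") as sourceFile:
    nUpdated = updateFromFile(
        sourceFile,
        "clinical_item",
        columnNames=["clinical_item_id", "analysis_status", "default_recommend"],
        nIdCols=1,          # Only the first column is part of the row-identifying key
        delim="\t",
        skipErrors=True,    # Commit per row and continue past bad rows
    )
    print("Updated %d rows" % nUpdated)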
def __call__(self, inputFile, labelCols, valueCols, matchCols, baseLabels=None):
    prog = ProgressDots()

    self.labelCols = labelCols
    self.valueCols = valueCols
    self.matchCols = matchCols
    self.baseLabels = baseLabels

    labelModelByLabelKey = dict()
    dataByLabelKey = dict()

    reader = TabDictReader(inputFile)
    for rowModel in reader:
        labelKey = list()
        labelModel = dict()
        for labelCol in self.labelCols:
            labelModel[labelCol] = rowModel[labelCol]
            labelKey.append(rowModel[labelCol])
        labelKey = tuple(labelKey)  # Change to immutable object that can be hashed

        # Copy just items of interest
        valueModel = {}
        if self.matchCols:
            for matchCol in self.matchCols:
                valueModel[matchCol] = rowModel[matchCol]
        for valueCol in self.valueCols:
            try:
                valueModel[valueCol] = float(rowModel[valueCol])
            except ValueError:  # Maybe None string, could not parse into a number
                valueModel[valueCol] = None

        if labelKey not in dataByLabelKey:
            labelModelByLabelKey[labelKey] = labelModel
            dataByLabelKey[labelKey] = list()
        dataByLabelKey[labelKey].append(valueModel)

        prog.update()
    # prog.printStatus();

    # Another pass to ensure data is consistently sorted in each group to allow later paired t-tests
    if self.matchCols:
        for labelKey, data in dataByLabelKey.iteritems():
            data.sort(RowItemFieldComparator(self.matchCols))

    # See if looking for only one set of base labeled data to compare the rest against
    baseLabelKey = None
    if self.baseLabels is not None:
        baseLabelKey = tuple(self.baseLabels)

    # Result pass to compare all group pair-wise combinations
    prog = ProgressDots()
    for labelKey0, data0 in dataByLabelKey.iteritems():
        prefix0 = "Group0."
        labelModel0 = labelModelByLabelKey[labelKey0]

        if baseLabelKey is not None and labelKey0 != baseLabelKey:
            continue    # Skip entries where the base label does not match specified key

        for labelKey1, data1 in dataByLabelKey.iteritems():
            prefix1 = "Group1."
            labelModel1 = labelModelByLabelKey[labelKey1]

            result = dict()
            for labelCol in self.labelCols:
                result[prefix0 + labelCol] = labelModel0[labelCol]
                result[prefix1 + labelCol] = labelModel1[labelCol]

            for valueCol in self.valueCols:
                # Pull out value column for each data group. Previously sorted by match col to allow paired t-testing.
                # Skip any value pairs with non-numeric / None values.
                values0 = list()
                values1 = list()
                for dataItem0, dataItem1 in zip(data0, data1):
                    if dataItem0[valueCol] is not None and dataItem1[valueCol] is not None:
                        values0.append(dataItem0[valueCol])
                        values1.append(dataItem1[valueCol])

                for summaryFunction in SUMMARY_FUNCTIONS:
                    result[prefix0 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values0)
                    result[prefix1 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values1)

                for compTest in COMPARISON_TESTS:
                    (t, p) = compTest(values0, values1)
                    if np.isnan(p):
                        p = None    # Use more generic expression for NaN / null value
                    result[compTest.__name__ + "." + valueCol] = p

            yield result
            prog.update()
"TBIL.last": [0,10], "K.last": [3,6], "Resp.last": [8,30], "Temp.last": [97,103], "Urine.last": [0,1000], "BP_Low_Diastolic.last": [40,80], "BP_High_Systolic.last": [80,160], "Glasgow.Coma.Scale.Score.last": [5,15], } dailyLabChangeRate = 0.1 baselineDNRRate = 0.1 durationRange = [1, 20] prog = ProgressDots(total=nPatients) for iPatient in range(nPatients): rowData = \ { "patient_id":iPatient, "Birth.preTimeDays": random.randint(ageRange[0], ageRange[-1]), "income": random.randrange(incomeRange[0], incomeRange[-1], incomeStep), "Female.pre": (random.random() < femaleRate)+0, "AnyVasoactive.pre": (random.random() < vasoactiveRate)+0, # Add 0 to convert boolean to 0 or 1 "AnyVentilator.pre": (random.random() < ventilatorRate)+0, "AnyCRRT.pre": (random.random() < crrtRate)+0, "Charlson.Cerebrovascular.pre": (random.random() < comorbidityRate)+0, "Charlson.CHF.pre": (random.random() < comorbidityRate)+0, "Charlson.COPD.pre": (random.random() < comorbidityRate)+0,