def parseLabResultsFile(labFile):
    """Read a tab-delimited lab results file and bucket rows by patient, then lab base name.

    Rows with a missing/NULL_STRING ord_num_value are ignored, as are rows whose
    numeric value is >= LAB_SENTINEL_VALUE (apparent placeholder values).
    Numeric and date fields are parsed in place on each row dict.
    Returns {patientId: {labBaseName: [labResult row dict, ...]}}.
    """
    log.info("Parse lab results file")
    progress = ProgressDots()
    resultsByBaseNameByPatient = dict()
    for row in TabDictReader(labFile):
        rawValue = row["ord_num_value"]
        if rawValue is not None and rawValue != NULL_STRING:
            patientKey = int(row["pat_id"])
            baseName = row["base_name"]
            numericValue = float(rawValue)
            parsedTime = DBUtil.parseDateValue(row["result_time"])
            if numericValue < LAB_SENTINEL_VALUE:  # Skip apparent placeholder values
                # Normalize parsed values back onto the row dict
                row["pat_id"] = row["patient_id"] = patientKey
                row["ord_num_value"] = numericValue
                row["result_time"] = parsedTime
                perPatient = resultsByBaseNameByPatient.setdefault(patientKey, dict())
                perPatient.setdefault(baseName, list()).append(row)
        progress.update()
    progress.printStatus()
    return resultsByBaseNameByPatient
def queryItems(self, options, outputFile):
    """Query order_med records that fulfill the options criteria and stream the
    results as tab-delimited output to outputFile, pausing between rows.
    """
    pauseSeconds = float(options.pauseSeconds)

    query = SQLQuery()
    query.addSelect("count(order_med_id_coded) as nOrders")
    query.addSelect("om.med_route, om.medication_id, om.med_description")
    query.addFrom("starr_datalake2018.order_med as om")
    if options.descriptionPrefix:
        # Append wildcard to enable prefix search
        query.addWhereOp("om.med_description", "like", options.descriptionPrefix + "%%")
    if options.medRoutes:
        query.addWhereIn("om.med_route", options.medRoutes.split(","))
    query.addGroupBy("om.medication_id, om.med_description, om.med_route")
    query.addOrderBy("nOrders desc, om.med_description")

    formatter = TextResultsFormatter(outputFile)
    progress = ProgressDots()
    for resultRow in DBUtil.execute(query, includeColumnNames=True, connFactory=self.connFactory):
        formatter.formatTuple(resultRow)
        time.sleep(pauseSeconds)  # Throttle between rows
        progress.update()
    progress.printStatus()
def queryItems(self, options, outputFile):
    """Query clinical_item records (joined to their categories) that fulfill the
    options criteria and stream the results as tab-delimited output to
    outputFile, pausing between rows.
    """
    pauseSeconds = float(options.pauseSeconds)

    query = SQLQuery()
    query.addSelect("cic.description, ci.clinical_item_id, ci.name, ci.description")
    query.addFrom("clinical_item_category as cic")
    query.addFrom("clinical_item as ci")
    query.addWhere("cic.clinical_item_category_id = ci.clinical_item_category_id")
    if options.itemPrefix:
        # Append wildcard to enable prefix search
        query.addWhereOp("ci.description", "like", options.itemPrefix + "%%")
    if options.categoryNames:
        query.addWhereIn("cic.description", options.categoryNames.split(","))
    query.addOrderBy("cic.description, ci.name, ci.description, ci.clinical_item_id")

    formatter = TextResultsFormatter(outputFile)
    progress = ProgressDots()
    for resultRow in DBUtil.execute(query, includeColumnNames=True, connFactory=self.connFactory):
        formatter.formatTuple(resultRow)
        time.sleep(pauseSeconds)  # Throttle between rows
        progress.update()
    progress.printStatus()
def main(nResamples=1000, sampleSize=200, positiveCount=180, confidenceInterval=0.95):
    """Bootstrap a confidence interval for calcStat over a binary sample.

    Builds a sample of sampleSize items with positiveCount ones (rest zeros),
    draws nResamples resamples with replacement, and prints the percentile
    bounds of calcStat at the given confidenceInterval (default 0.95,
    previously hard-coded; now a backward-compatible parameter).
    """
    # Construct the 1/0 sample vector directly instead of an append loop
    sample = np.zeros(sampleSize, dtype=int)
    sample[:positiveCount] = 1
    print("Sample Rate: ", calcStat(sample))

    rng = np.random.RandomState(nResamples)  # Seed with number for consistency
    bootstrapValues = list()
    prog = ProgressDots()
    for _ in range(nResamples):
        # FIX: RandomState.random_integers(0, sampleSize-1, sampleSize) is
        # deprecated/removed; randint(0, sampleSize, sampleSize) draws the same
        # inclusive [0, sampleSize-1] range from the same underlying stream.
        indices = rng.randint(0, sampleSize, sampleSize)  # Resample indices (with replacement)
        resample = sample[indices]
        bootstrapValues.append(calcStat(resample))
        prog.update()

    ciFractionLow = (1.0 - confidenceInterval) / 2
    ciFractionHigh = confidenceInterval + ciFractionLow
    bootstrapValues.sort()
    lowIndex = int(ciFractionLow * nResamples)
    # Clamp so a confidenceInterval of 1.0 cannot index one past the end
    highIndex = min(int(ciFractionHigh * nResamples), nResamples - 1)
    print("Sample Rate 95%CI Low", bootstrapValues[lowIndex])
    print("Sample Rate 95%CI High", bootstrapValues[highIndex])
def addLabFeatures(labsByBaseNameByPatientId, patientById, colNames, indexItemBaseName, labBaseNames, labPreTime, labPostTime):
    """Aggregate each patient's lab results around their index event into flat feature columns.

    The earliest result of indexItemBaseName defines the patient's index_time.
    For every lab in labBaseNames, results with result_time in
    [indexTime+labPreTime, indexTime+labPostTime) are summarized into
    <base>.min/.max/.median/.mean/.std/.first/.last/.proximate columns written
    directly onto the patientById record (left as None when no data).
    colNames is extended in place via colsFromLabBaseNames.

    NOTE(review): uses dict.iteritems() — Python 2 only.
    """
    log.info("Sort lab results by result time for each patient and find items within specified time period to aggregate");
    prog = ProgressDots();
    for iPatient, (patientId, labsByBaseName) in enumerate(labsByBaseNameByPatientId.iteritems()):
        # Look for the first result of the index item (ferritin)
        indexItem = None;
        if indexItemBaseName in labsByBaseName:
            for labResult in labsByBaseName[indexItemBaseName]:
                if indexItem is None or labResult["result_time"] < indexItem["result_time"]:
                    indexItem = labResult;
        if indexItem is not None:   # Skip this patient if no index item found, should not be possible since pre-screened for relevant patients
            indexTime = indexItem["result_time"];
            patient = patientById[patientId];
            patient["index_time"] = indexTime;
            # Time window of interest relative to the index event
            preTimeLimit = indexTime+labPreTime;
            postTimeLimit = indexTime+labPostTime;
            for labBaseName in labBaseNames:
                # Default to null for all values so every column exists on every patient
                patient["%s.min" % labBaseName] = None;
                patient["%s.max" % labBaseName] = None;
                patient["%s.median" % labBaseName] = None;
                patient["%s.mean" % labBaseName] = None;
                patient["%s.std" % labBaseName] = None;
                patient["%s.first" % labBaseName] = None;
                patient["%s.last" % labBaseName] = None;
                patient["%s.proximate" % labBaseName] = None;

                proximateValue = None;
                if labBaseName in labsByBaseName:   # Not all patients will have all labs checked
                    proximateItem = None;   # Item closest to the index item in time
                    valueList = list();
                    for labResult in labsByBaseName[labBaseName]:
                        resultTime = labResult["result_time"];
                        if preTimeLimit <= resultTime and resultTime < postTimeLimit:
                            # Occurs within time frame of interest, so record this value
                            valueList.append(labResult["ord_num_value"]);
                            if proximateItem is None or (abs(resultTime-indexTime) < abs(proximateItem["result_time"]-indexTime)):
                                proximateItem = labResult;
                                proximateValue = proximateItem["ord_num_value"];
                    if len(valueList) > 0:
                        patient["%s.min" % labBaseName] = np.min(valueList);
                        patient["%s.max" % labBaseName] = np.max(valueList);
                        patient["%s.median" % labBaseName] = np.median(valueList);
                        patient["%s.mean" % labBaseName] = np.mean(valueList);
                        patient["%s.std" % labBaseName] = np.std(valueList);
                        patient["%s.first" % labBaseName] = valueList[0];   # Assumes previously sorted
                        patient["%s.last" % labBaseName] = valueList[-1];   # by result_time
                        patient["%s.proximate" % labBaseName] = proximateValue;
        prog.update();
    colNames.extend(colsFromLabBaseNames(labBaseNames));
    prog.printStatus();
def buildModel(self, corpusBOWGenerator, numTopics):
    """Build topic model from corpus (interpret as generator over contents).

    Given the bag-of-words corpus, build a docCountByWordId count dictionary
    to facilitate subsequent Term Frequency * Inverse DOCUMENT FREQUENCY
    calculations. In Clinical context, document = patient.
    Return (model, docCountByWordId).
    """
    # Load dictionary to translate item IDs to descriptions
    itemsById = DBUtil.loadTableAsDict("clinical_item")
    id2word = dict()
    # Models expect a pair for every possible item ID, and judge vocabulary size
    # by the length of this dictionary rather than the maximum ID value.
    # That means we have to populate all of the empty ones as well.
    maxId = max(itemsById.keys())
    for itemId in range(maxId + 1):
        description = str(itemId)  # Default to just the same as the ID string
        if itemId in itemsById:
            description = itemsById[itemId]["description"]
        id2word[itemId] = description

    # Stream in progressive updates from corpus generator so don't have to load all into memory.
    # Do a batch of many at a time, otherwise very slow to increment one at a time.
    docBuffer = list()
    prog = ProgressDots()
    self.model = None
    self.docCountByWordId = {None: 0}  # Use None key to represent count of all documents
    for i, document in enumerate(corpusBOWGenerator):
        for (wordId, wordCount) in document:
            # Assuming uniqueness of wordId keys for each document,
            # so each document contributes at most 1 to a word's doc count
            if wordId not in self.docCountByWordId:
                self.docCountByWordId[wordId] = 0
            self.docCountByWordId[wordId] += 1
        self.docCountByWordId[None] += 1
        docBuffer.append(document)
        if i % BUFFER_UPDATE_SIZE == (BUFFER_UPDATE_SIZE - 1):
            # Update model with current buffer of documents
            self.model = self.updateModel(self.model, docBuffer, id2word, numTopics)
            docBuffer = list()  # Discard committed buffer
        prog.update()
    # Last update for any remaining documents in buffer
    self.model = self.updateModel(self.model, docBuffer, id2word, numTopics)
    docBuffer = list()  # Discard committed buffer
    # prog.printStatus();
    return (self.model, self.docCountByWordId)
def main(self, argv):
    """Main method, callable from command line.

    Polls until the given process ID no longer exists, then launches the
    given command string via subprocess and reports total elapsed time.
    """
    usageStr = "usage: %prog -p <pid> -c <commandStr>\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-p", "--pid", dest="pid",
        help="Process ID to monitor. As soon as it is no longer found by Unix signal (os.kill(pid,0)), will proceed to execute the given command")
    parser.add_option("-i", "--interval", dest="interval", default="1",
        help="How many seconds to wait before checking if the PID is still active. Default to 1 second.")
    parser.add_option("-c", "--commandStr", dest="commandStr",
        help="Command string to execute after the specified PID is no longer found. Will just pass whole string to subprocess")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    startTime = time.time()
    if options.pid and options.commandStr:
        pid = int(options.pid)
        interval = float(options.interval)
        prog = ProgressDots(60, 1, "intervals")
        while pidExists(pid):
            time.sleep(interval)
            prog.update()
        # prog.printStatus();
        # sys.stderr.write works under both Python 2 and 3
        # (original used Python-2-only "print >> sys.stderr" statements)
        sys.stderr.write("Executing: %s\n" % options.commandStr)
        process = subprocess.Popen(options.commandStr)
        sys.stderr.write("Started process: %s\n" % process.pid)
    else:
        parser.print_help()
        sys.exit(-1)
    # BUG FIX: original assigned timer = time.time() - timer twice, so the
    # second (logged) figure subtracted an elapsed duration from a wall-clock
    # timestamp and was meaningless. Compute elapsed once from the start time.
    elapsed = time.time() - startTime
    sys.stderr.write("%.3f seconds to complete\n" % elapsed)
    log.info("%.3f seconds to complete", elapsed)
def parseClinicalItemFile(itemFile, patientIdCol="patient_id", timeCol="item_date"):
    """Parse a tab-delimited clinical item file into a map of item times per patient.

    Parses the patient ID and item date columns in place on each row and
    returns {patientId: [itemTime, ...]} in file order.
    """
    progress = ProgressDots()
    timesByPatient = dict()
    for record in TabDictReader(itemFile):
        patientKey = int(record[patientIdCol])
        eventTime = DBUtil.parseDateValue(record[timeCol])
        # Store parsed values back on the row dict
        record[patientIdCol] = patientKey
        record[timeCol] = eventTime
        timesByPatient.setdefault(patientKey, list()).append(eventTime)
        progress.update()
    progress.printStatus()
    return timesByPatient
def main(argv):
    """Compute and store result stats for every sim_result name not yet present
    in order_result_stat, committing all inserts in a single transaction.
    """
    conversionProcessor = STRIDEOrderResultsConversion()
    conn = DBUtil.connection()
    try:
        # Names still missing from the calculated stats table
        nameTable = DBUtil.execute(
            "select name from sim_result except select base_name from order_result_stat",
            conn=conn)
        progress = ProgressDots(big=1, small=1, total=len(nameTable))
        for resultRow in nameTable:
            baseName = resultRow[0]
            print("Calculating Stats for %s" % baseName, file=sys.stderr)
            statModel = conversionProcessor.calculateResultStats(baseName, conn=conn)
            DBUtil.insertRow("order_result_stat", statModel, conn=conn)
            progress.update()
        progress.printStatus()
        conn.commit()
    finally:
        conn.close()
def __call__(self, inputFiles):
    """Return generator over dictionary objects representing the concatenated
    contents of the input files after adding and accounting for argv
    parameter columns.
    """
    # Consolidate a master set of all column headers to use
    self.colNames = list()  # Keep track of consistent order found
    colSet = set()  # Keep track of unique items

    # Pull out any header comments that may represent an argv list to parse,
    # and the header row with column labels for each input file
    argvDicts = list()  # Per input file, argv parameter name -> value
    readers = list()  # TabDictReader from which header columns can be accessed as fieldnames
    for inputFile in inputFiles:
        reader = TabDictReader(inputFile)
        readers.append(reader)
        for col in reader.fieldnames:
            if col not in colSet:
                colSet.add(col)
                self.colNames.append(col)
        # Must be called after reader.fieldnames so initial text parsing will start
        argvDict = self.extract_argvDict(reader.commentLines)
        argvDicts.append(argvDict)
        # FIX: was argvDict.iterkeys() (Python 2 only); plain iteration over the
        # dict yields the same keys under both Python 2 and 3
        for col in argvDict:
            if col not in colSet:
                colSet.add(col)
                self.colNames.append(col)

    prog = ProgressDots(50, 1, "Files")
    # Now generate each file in succession, "outer-joined" to include the
    # master column header list (missing columns filled with None)
    for argvDict, reader in zip(argvDicts, readers):
        for resultDict in reader:
            resultDict.update(argvDict)
            for col in self.colNames:
                if col not in resultDict:
                    resultDict[col] = None
            yield resultDict
        prog.update()
def main_quickTest(argv):
    """Load a saved topic model and its doc counts, regenerate per-topic item
    weights, and time three full passes over the weight structure.

    NOTE(review): uses xrange/iteritems — Python 2 only.
    """
    modelFilename = argv[1]
    modeler = TopicModel()

    timer = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    timer = time.time() - timer
    log.info("%.2f seconds to load", timer)

    timer = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(model, 100)
    timer = time.time() - timer
    log.info("%.2f seconds to generate weights", timer)

    # Timing pass: touch every (topic, item) weight three times;
    # only progress dots are emitted, values are not used
    for i in xrange(3):
        prog = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                prog.update()
        prog.printStatus()
# NOTE(review): unterminated triple-quote below appears to open a commented-out
# region whose remainder is outside this view — confirm before editing
"""
def main(argv):
    """For each opioid class, measure the association between having an active
    prescription (by medication_id) and each problem-list diagnosis group,
    printing one tab-delimited row of contingency statistics per (Rx, Dx) pair.

    NOTE(review): uses dict.iteritems() and Python 2 print statements — Python 2 only.
    """
    # Medication ID whitelists per opioid class
    medIdsByActiveRx = dict()
    medIdsByActiveRx['Buprenorphine'] = ('125498', '114474', '212560', '114475', '114467', '114468')
    medIdsByActiveRx['Fentanyl Patch'] = ('2680', '27908', '125379', '27905', '27906', '540107', '540638', '540101', '27907')
    medIdsByActiveRx['Methadone'] = ('540483', '4953', '4951', '10546', '214468', '15996', '41938', '4954', '4952')
    medIdsByActiveRx['Hydrocodone'] = ('3724', '4579', '8576', '8577', '8951', '10204', '12543', '13040', '14963', '14965', '14966', '17061', '17927', '19895', '20031', '28384', '29486', '29487', '34505', '34544', '35613', '117862', '204249', '206739')
    medIdsByActiveRx['Hydromorphone'] = ('2458', '2459', '2464', '2465', '3757', '3758', '3759', '3760', '3761', '10224', '10225', '10226', '10227', '200439', '201094', '201096', '201098', '540125', '540179', '540666')
    medIdsByActiveRx['Morphine'] = ('5167', '5168', '5172', '5173', '5176', '5177', '5178', '5179', '5180', '5183', '6977', '10655', '15852', '20908', '20909', '20910', '20914', '20915', '20919', '20920', '20921', '20922', '29464', '30138', '31413', '36140', '36141', '79691', '87820', '89282', '91497', '95244', '96810', '112562', '112564', '115335', '115336', '126132', '198543', '198544', '198623', '201842', '201848', '205011', '206731', '207949', '208896', '540182', '540300')
    medIdsByActiveRx['Oxycodone'] = ('5940', '5941', '6122', '6981', '10812', '10813', '10814', '14919', '16121', '16123', '16129', '16130', '19187', '26637', '26638', '27920', '27921', '27922', '27923', '28897', '28899', '28900', '31851', '31852', '31863', '31864', '92248', '126939', '200451', '203690', '203691', '203692', '203705', '203706', '203707', '204020', '204021')

    # Denominator: total patient count from the shared base query
    query = baseQuery()
    totalPatients = float(DBUtil.execute(query)[0][0])
    # print"Total Patients\t%s" % totalPatients
    # print"======= Dx Groups ===========";
    # print"Dx Group\tPt Count\tDx Rate";

    # Patient counts per diagnosis group (or per diagnosis pair when DOUBLE_DX)
    patientsPerDxGroup = dict()
    query = SQLQuery()
    query.addSelect("count(distinct prob.pat_id) as ptCount")
    query.addSelect("prob.%s" % DX_COL)
    query.addFrom("stride_problem_list as prob")
    query.addWhereOp("prob.noted_date", "<", LIMIT_DATE)
    query.addGroupBy("prob.%s" % DX_COL)
    if DOUBLE_DX:
        # Self-join to count patients carrying each pair of diagnoses
        query.addSelect("prob2.%s" % DX_COL)
        query.addFrom("stride_problem_list as prob2")
        query.addWhere("prob.pat_id = prob2.pat_id")
        query.addWhereOp("prob2.noted_date", "<", LIMIT_DATE)
        query.addGroupBy("prob2.%s" % DX_COL)
    results = DBUtil.execute(query)
    for row in results:
        patientCount = row[0]
        dxGroup = row[1]
        if DOUBLE_DX:
            dxGroup = (dxGroup, row[2])  # Composite tuple including second diagnosis
        patientsPerDxGroup[dxGroup] = patientCount

    progress = ProgressDots()
    for activeRx, medIds in medIdsByActiveRx.iteritems():
        query = baseQuery()
        query.addWhereIn("medication_id", medIds)

        # Baseline prescription rates
        rxPtCount = DBUtil.execute(query)[0][0]
        # print"====== Rx Counts ======";
        # print"Rx\tPt Count\tRx Rate";
        # print"%s\t%s\t%s" % (activeRx, rxPtCount, (rxPtCount/totalPatients));

        # print"======== Rx-Dx Association ========";
        statIds = ("P-Fisher", "P-YatesChi2", "oddsRatio", "relativeRisk", "interest", "LR+", "LR-", "sensitivity", "specificity", "PPV", "NPV",)
        if progress.getCounts() == 0:
            # Emit the header row once, before the first data row
            headerCols = ["Rx", "Dx", "RxDxCount", "RxCount", "DxCount", "Total"]
            if DOUBLE_DX:
                headerCols.insert(2, "Dx2")
            headerCols.extend(statIds)
            headerStr = str.join("\t", headerCols)
            print headerStr

        # Query out per diagnosis group, but do as aggregate grouped query
        query.addSelect("prob.%s" % DX_COL)
        query.addFrom("stride_problem_list as prob")
        query.addWhere("med.pat_id = prob.pat_id")
        query.addWhereOp("prob.noted_date", "<", LIMIT_DATE)
        #query.addWhereIn("prob.%s" % DX_COL, dxKeys );
        query.addGroupBy("prob.%s" % DX_COL)
        if DOUBLE_DX:
            query.addSelect("prob2.%s" % DX_COL)
            query.addFrom("stride_problem_list as prob2")
            query.addWhere("prob.pat_id = prob2.pat_id")
            query.addWhereOp("prob2.noted_date", "<", LIMIT_DATE)
            query.addGroupBy("prob2.%s" % DX_COL)
        results = DBUtil.execute(query)
        for row in results:
            rxDxPtCount = row[0]
            dxGroup = row[1]
            if DOUBLE_DX:
                dxGroup = (dxGroup, row[2])  # Composite tuple including second diagnosis
            dxPtCount = patientsPerDxGroup[dxGroup]

            conStats = ContingencyStats(rxDxPtCount, rxPtCount, dxPtCount, totalPatients)
            dataCells = [activeRx, dxGroup, rxDxPtCount, rxPtCount, dxPtCount, totalPatients]
            if DOUBLE_DX:
                # Flatten the composite group back into two separate columns
                dataCells[1] = dxGroup[0]
                dataCells.insert(2, dxGroup[1])
            for statId in statIds:
                try:
                    dataCells.append(conStats[statId])
                except ZeroDivisionError:
                    dataCells.append(None)  # Degenerate contingency cell; report as null
            for i, value in enumerate(dataCells):
                dataCells[i] = str(value)  # String conversion to allow for concatenation below
            dataStr = str.join("\t", dataCells)
            print dataStr
            progress.update()
    progress.printStatus()
def queryPatientEpisodes(outputFile, extractor):
    """Select ED admissions with suspected sepsis (IV antibiotics, blood culture,
    or respiratory viral panel orders), excluding patients on a primary surgery
    team, then join encounter payor and admitting vital-sign data and write the
    episodes as tab-delimited rows to outputFile.

    Returns the list of patient episode RowItemModels.
    NOTE(review): uses Python-2-only "print >>" statements.
    """
    log.info("Select patient admissions with possible/probable sepsis within 24 hours of admission (long query >60 min?)...");
    conn = DBUtil.connection();
    cursor = conn.cursor();
    try:
        # Clinical item category for admission diagnoses
        # ADMIT_DX_CATEGORY_ID = 2;
        admitDxCategoryId = DBUtil.execute("select clinical_item_category_id from clinical_item_category where description like '%%ADMIT_DX%%'", conn=conn)[0][0];

        # Look for items indicating suspected infection / sepsis
        ivAntibioticItemIds = loadIVAntibioticItemIds(extractor);
        bloodCultureItemIds = loadBloodCultureItemIds(extractor);
        respiratoryViralPanelItemIds = loadRespiratoryViralPanelItemIds(extractor);

        # Merge IV antibiotics and blood cultures, respiratory panels as items that suggest sepsis is suspected
        suspectSepsisItemIds = ivAntibioticItemIds.union(bloodCultureItemIds.union(respiratoryViralPanelItemIds));
        suspectSepsisItemIdsStr = str.join(',', [str(itemId) for itemId in suspectSepsisItemIds]);   # Convert to comma-separated string for direct embedding in the SQL below

        # Look for primary surgery teams to exclude
        excludeTeamCategory = "SurgerySpecialty";
        excludeTreatmentTeams = list();
        for row in extractor.loadMapData("TreatmentTeamGroups"):
            if row["team_category"] == excludeTeamCategory:
                excludeTreatmentTeams.append(row["treatment_team"]);
        query = SQLQuery();
        query.addSelect("clinical_item_id");
        query.addFrom("clinical_item");
        query.addWhereIn("description", excludeTreatmentTeams );
        excludeTeamItemIds = set();
        for row in DBUtil.execute(query, conn=conn):
            excludeTeamItemIds.add(row[0]);
        excludeTeamItemIdsStr = str.join(',', [str(itemId) for itemId in excludeTeamItemIds]);   # Convert to comma-separated string for direct embedding in the SQL below

        # First pass query to get the list of patients and emergency department presentation times
        cohortQuery = \
        """
        -- Pick out date(s) when admitted through emergency department and matching discharge time
        select adt1.pat_anon_id, adt1.pat_enc_csn_anon_id, adt1.shifted_transf_in_dt_tm as edAdmitTime, adt2.shifted_transf_out_dt_tm as dischargeTime
        from stride_adt as adt1, stride_adt as adt2
        where
            -- Admission event
            adt1.department_in = 'EMERGENCY DEPARTMENT' and
            adt1.event_in = 'Admission' and
            adt1.pat_anon_id in
            (   -- Select any patient with any suspected sepsis related order (i.e., IV antibiotics or blood cultures)
                select patient_id
                from patient_item as pi
                where pi.clinical_item_id in (%s)
                except
                -- Exclude any patient who has been on a primary surgery team
                select patient_id
                from patient_item
                where clinical_item_id in (%s)
                -- -12434586418575,-12432455207729,-12428492282572,-12428492282572,-12424048595257,-12414081679705
            ) and
            adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id and
            -- Discharge event
            adt2.event_out = 'Discharge'
        order by adt1.shifted_transf_in_dt_tm
        """ % (suspectSepsisItemIdsStr, excludeTeamItemIdsStr);
        print >> sys.stderr, cohortQuery;
        cursor.execute(cohortQuery);

        patientEpisodes = list();
        patientEpisodeById = dict();

        # Collect basic patient ID and ED presentation dates and Discharge date/time
        prog = ProgressDots();
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, edAdmitTime, dischargeTime) = row;
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                (   {   "patient_id":patientId,
                        "edAdmitTime":edAdmitTime,
                        "dischargeTime":dischargeTime,
                        "encounter_id":encounterId,
                        "payorTitle": None, # Default encounter data to null in case can't find it later
                        "bpSystolic": None,
                        "bpDiastolic": None,
                        "temperature": None,
                        "pulse": None,
                        "respirations": None,
                    }
                );
            patientEpisodes.append(patientEpisode);
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[patientEpisode["encounter_id"]] = patientEpisode;
            prog.update();
            row = cursor.fetchone();
        prog.printStatus();

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id");
        query = SQLQuery();
        query.addSelect("pat_id");
        query.addSelect("pat_enc_csn_id");
        query.addSelect("title");
        query.addSelect("bp_systolic");
        query.addSelect("bp_diastolic");
        query.addSelect("temperature");
        query.addSelect("pulse");
        query.addSelect("respirations");
        query.addFrom("stride_patient_encounter");
        query.addWhereIn("pat_enc_csn_id", encounterIds);
        cursor.execute(str(query), query.params);
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic, temperature, pulse, respirations) = row;
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId];
                if patientEpisode["payorTitle"] is None:
                    patientEpisode["payorTitle"] = set();   # Single encounters may have multiple payors to track
                patientEpisode["payorTitle"].add(payorTitle);
                patientEpisode["bpSystolic"] = bpSystolic;
                patientEpisode["bpDiastolic"] = bpDiastolic;
                patientEpisode["temperature"] = temperature;
                patientEpisode["pulse"] = pulse;
                patientEpisode["respirations"] = respirations;
            row = cursor.fetchone();

        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile);
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True);

        return patientEpisodes;
    finally:
        cursor.close();
        conn.close();
def __call__(self, inputFile, labelCols, valueCols, matchCols, baseLabels=None):
    """Group the rows of a tab-delimited input file by labelCols, then yield
    one result dict per pair of groups comparing each value column with the
    SUMMARY_FUNCTIONS and COMPARISON_TESTS (paired by matchCols when given).

    If baseLabels is provided, only the group matching that label tuple is
    used as Group0 against all others.
    NOTE(review): uses dict.iteritems() and cmp-style list.sort(comparator) —
    Python 2 only.
    """
    prog = ProgressDots()
    self.labelCols = labelCols
    self.valueCols = valueCols
    self.matchCols = matchCols
    self.baseLabels = baseLabels

    # First pass: bucket rows into groups keyed by the label column values
    labelModelByLabelKey = dict()
    dataByLabelKey = dict()
    reader = TabDictReader(inputFile)
    for rowModel in reader:
        labelKey = list()
        labelModel = dict()
        for labelCol in self.labelCols:
            labelModel[labelCol] = rowModel[labelCol]
            labelKey.append(rowModel[labelCol])
        labelKey = tuple(labelKey)  # Change to immutable object that can be hashed

        # Copy just items of interest
        valueModel = {}
        if self.matchCols:
            for matchCol in self.matchCols:
                valueModel[matchCol] = rowModel[matchCol]
        for valueCol in self.valueCols:
            try:
                valueModel[valueCol] = float(rowModel[valueCol])
            except ValueError:  # Maybe None string, could not parse into a number
                valueModel[valueCol] = None

        if labelKey not in dataByLabelKey:
            labelModelByLabelKey[labelKey] = labelModel
            dataByLabelKey[labelKey] = list()
        dataByLabelKey[labelKey].append(valueModel)
        prog.update()
    # prog.printStatus();

    # Another pass to ensure data is consistently sorted in each group to allow later paired t-tests
    if self.matchCols:
        for labelKey, data in dataByLabelKey.iteritems():
            data.sort(RowItemFieldComparator(self.matchCols))

    # See if looking for only one set of base labeled data to compare the rest against
    baseLabelKey = None
    if self.baseLabels is not None:
        baseLabelKey = tuple(self.baseLabels)

    # Result pass to compare all group pair-wise combinations
    prog = ProgressDots()
    for labelKey0, data0 in dataByLabelKey.iteritems():
        prefix0 = "Group0."
        labelModel0 = labelModelByLabelKey[labelKey0]
        if baseLabelKey is not None and labelKey0 != baseLabelKey:
            continue  # Skip entries where the base label does not match specified key
        for labelKey1, data1 in dataByLabelKey.iteritems():
            prefix1 = "Group1."
            labelModel1 = labelModelByLabelKey[labelKey1]

            result = dict()
            for labelCol in self.labelCols:
                result[prefix0 + labelCol] = labelModel0[labelCol]
                result[prefix1 + labelCol] = labelModel1[labelCol]

            for valueCol in self.valueCols:
                # Pull out value column for each data group. Previously sorted by
                # match col to allow paired t-testing.
                # Skip any value pairs with a non-numeric / None value
                values0 = list()
                values1 = list()
                for dataItem0, dataItem1 in zip(data0, data1):
                    if dataItem0[valueCol] is not None and dataItem1[valueCol] is not None:
                        values0.append(dataItem0[valueCol])
                        values1.append(dataItem1[valueCol])

                for summaryFunction in SUMMARY_FUNCTIONS:
                    result[prefix0 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values0)
                    result[prefix1 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values1)

                for compTest in COMPARISON_TESTS:
                    (t, p) = compTest(values0, values1)
                    if np.isnan(p):
                        p = None  # Use more generic expression for NaN / null value
                    result[compTest.__name__ + "." + valueCol] = p
            yield result
            prog.update()
# NOTE(review): fragment — the enclosing function definition and the setup of
# rowData, lengthOfStay, dailyDNRRate, formatter, colNames, prog, and ofs are
# outside this view; indentation reconstructed, confirm against the full source.

# Randomly pick a DNR onset day within the stay (day 1 onward); presumably
# dnrOccurs/dnrDay defaults are initialized above — TODO confirm
for iDay in range(1, lengthOfStay):
    if (random.random() < dailyDNRRate):
        dnrDay = iDay
        dnrOccurs = True
        break  # Don't need to keep looking
rowData["AnyDNRatEnd"] = dnrOccurs + 0  # +0 coerces bool to 0/1 for output

# Generate daily data: one output row per day of the stay
for iDay in range(lengthOfStay):
    rowData["curr_day"] = rowData["start"] = iDay
    rowData["end"] = iDay + 1
    rowData["timeUntilNoMoreData"] = lengthOfStay - rowData["start"]
    rowData["timeUntilNoDataOrDNR"] = rowData["timeUntilNoMoreData"]
    if dnrOccurs:
        rowData["AnyDNR.pre"] = (iDay >= dnrDay) + 0  # DNR already in effect by this day
        rowData["AnyDNR.within1day"] = (iDay + 1 == dnrDay) + 0  # DNR starts tomorrow
        rowData["AnyDNR.postTimeDays"] = dnrDay - iDay
        if rowData["AnyDNR.postTimeDays"] < 0:
            rowData["AnyDNR.postTimeDays"] = "NA"  # DNR already happened; no future onset
        else:
            rowData["timeUntilNoDataOrDNR"] = min(
                rowData["timeUntilNoMoreData"], rowData["AnyDNR.postTimeDays"])
    else:
        rowData["AnyDNR.pre"] = 0
        rowData["AnyDNR.within1day"] = 0
        rowData["AnyDNR.postTimeDays"] = "NA"
    formatter.formatResultDict(rowData, colNames)
    prog.update()
ofs.close()
def queryPatientEpisodes(outputFile, extractor):
    """Select admissions for patients with a 'Tt Pamf Med (Primary)' treatment
    team assignment, then join encounter payor and admitting vital-sign data
    and write the episodes as tab-delimited rows to outputFile.

    Returns the list of patient episode RowItemModels.
    NOTE(review): extractor parameter is unused in this variant — it mirrors the
    signature of the sepsis-cohort version of this function.
    """
    log.info(
        "Select patient admissions with provider category of Tt Pamf Med (Primary) or Tt Med Univ (Primary)"
    )
    conn = DBUtil.connection()
    cursor = conn.cursor()
    try:
        # (Commented-out sepsis-cohort selection logic from the sibling
        # queryPatientEpisodes variant removed here: admit-dx category lookup,
        # suspected-sepsis item ID collection, and surgery-team exclusion.)

        # First pass query to get the list of patients and emergency department presentation times.
        # Category/description filter is interpolated as a constant literal (not user input).
        cohortQuery = \
        """
        select adt1.pat_anon_id, adt1.pat_enc_csn_anon_id, adt1.shifted_transf_in_dt_tm as edAdmitTime, adt2.shifted_transf_out_dt_tm as dischargeTime
        from stride_adt as adt1, stride_adt as adt2
        where adt1.pat_anon_id in
            (select patient_id from patient_item
                inner join clinical_item on patient_item.clinical_item_id = clinical_item.clinical_item_id
                where clinical_item.clinical_item_category_id = 161
                AND clinical_item.description = '%s')
        and adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id
        """ % ("Tt Pamf Med (Primary)")
        print(cohortQuery, file=sys.stderr)
        cursor.execute(cohortQuery)

        patientEpisodes = list()
        patientEpisodeById = dict()

        # Collect basic patient ID and ED presentation dates and Discharge date/time
        prog = ProgressDots()
        row = cursor.fetchone()
        while row is not None:
            (patientId, encounterId, edAdmitTime, dischargeTime) = row
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                (   {   "patient_id":patientId,
                        "edAdmitTime":edAdmitTime,
                        "dischargeTime":dischargeTime,
                        "encounter_id":encounterId,
                        "payorTitle": None, # Default encounter data to null in case can't find it later
                        "bpSystolic": None,
                        "bpDiastolic": None,
                        "temperature": None,
                        "pulse": None,
                        "respirations": None,
                    }
                )
            patientEpisodes.append(patientEpisode)
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[
                    patientEpisode["encounter_id"]] = patientEpisode
            prog.update()
            row = cursor.fetchone()
        prog.printStatus()

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id")
        query = SQLQuery()
        query.addSelect("pat_id")
        query.addSelect("pat_enc_csn_id")
        query.addSelect("title")
        query.addSelect("bp_systolic")
        query.addSelect("bp_diastolic")
        query.addSelect("temperature")
        query.addSelect("pulse")
        query.addSelect("respirations")
        query.addFrom("stride_patient_encounter")
        query.addWhereIn("pat_enc_csn_id", encounterIds)
        cursor.execute(str(query), query.params)
        row = cursor.fetchone()
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic,
             temperature, pulse, respirations) = row
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId]
                if patientEpisode["payorTitle"] is None:
                    # Single encounters may have multiple payors to track
                    patientEpisode["payorTitle"] = set()
                patientEpisode["payorTitle"].add(payorTitle)
                patientEpisode["bpSystolic"] = bpSystolic
                patientEpisode["bpDiastolic"] = bpDiastolic
                patientEpisode["temperature"] = temperature
                patientEpisode["pulse"] = pulse
                patientEpisode["respirations"] = respirations
            row = cursor.fetchone()

        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True)

        return patientEpisodes
    finally:
        cursor.close()
        conn.close()
def commitUpdateBuffer(self, updateBuffer, linkedItemIdsByBaseId, conn=None):
    """Take data accumulated in updateBuffer from prior update methods and
    commit them as incremental changes to the database. Clear buffer thereafter.

    updateBuffer may contain:
    - "incrementDataByItemIdPair": maps (stringified) item ID pairs to
      {column: increment} values applied to clinical_item_association rows.
    - "analyzedPatientItemIds": patient_item IDs whose analyze_date gets stamped.
    linkedItemIdsByBaseId: passed through to prepareItemAssociations so baseline
      association rows exist before the incremental UPDATEs run.
    conn: optional externally managed connection; if omitted, a connection is
      created here and closed in the finally block (commit happens either way).
    """
    extConn = conn is not None
    if not extConn:
        conn = self.connFactory.connection()
    try:
        if "incrementDataByItemIdPair" in updateBuffer:
            # Ensure baseline records exist to facilitate subsequent incremental update queries
            itemIdPairs = list(updateBuffer["incrementDataByItemIdPair"].keys())
            self.prepareItemAssociations(itemIdPairs, linkedItemIdsByBaseId, conn)

            # Construct incremental update queries based on each item pair's incremental counts/sums
            nItemPairs = len(itemIdPairs)
            log.debug("Primary increment updates for %d item pairs" % nItemPairs)
            incrementProg = ProgressDots(name="Increments")
            incrementProg.total = nItemPairs

            cursor = conn.cursor()
            try:
                for (itemIdPair, incrementData) in updateBuffer["incrementDataByItemIdPair"].items():
                    query = ["UPDATE clinical_item_association SET"]
                    for col, increment in incrementData.items():
                        query.append("%(col)s=%(col)s+%(increment)s" % {"col": col, "increment": increment})
                        query.append(",")
                    query.pop()  # Drop extra comma at end of list
                    query.append("WHERE clinical_item_id=%(p)s AND subsequent_item_id=%(p)s" % {"p": DBUtil.SQL_PLACEHOLDER})
                    query = str.join(" ", query)

                    # SECURITY NOTE: eval() on the stringified key tuple is only safe
                    # because these keys are generated internally by prior update
                    # methods; consider ast.literal_eval as a safe drop-in here.
                    itemIdPair = eval(itemIdPair)
                    cursor.execute(query, itemIdPair)
                    incrementProg.update()
                # incrementProg.printStatus();
            finally:
                cursor.close()

        if "analyzedPatientItemIds" in updateBuffer:
            # Record analysis date for the given patient items
            patientItemIdSet = updateBuffer["analyzedPatientItemIds"]
            nItems = len(patientItemIdSet)
            log.debug("Record %d analyzed items" % nItems)
            if nItems > 0:
                paramList = [datetime.now()]
                updateSize = 0
                for itemId in patientItemIdSet:
                    paramList.append(itemId)
                    updateSize += 1
                    if self.itemsPerUpdate is not None and updateSize > self.itemsPerUpdate:
                        # Update what we have so far to avoid excessive single mass
                        # query that may overwhelm database timeout
                        DBUtil.execute \
                        (   """update patient_item set analyze_date = %(p)s where patient_item_id in (%(pList)s) and analyze_date is null """ % {"p": DBUtil.SQL_PLACEHOLDER, "pList": generatePlaceholders(updateSize)},
                            tuple(paramList), conn=conn
                        )
                        # Reset item list parameters
                        paramList = [datetime.now()]
                        updateSize = 0

                # Final Update for any remaining items.
                # BUGFIX: guard against updateSize == 0 (happens when a batched flush
                # above landed exactly on the last item); the unguarded query would
                # render an invalid "in ()" clause with mismatched parameters.
                if updateSize > 0:
                    DBUtil.execute \
                    (   """update patient_item set analyze_date = %(p)s where patient_item_id in (%(pList)s) and analyze_date is null """ % {"p": DBUtil.SQL_PLACEHOLDER, "pList": generatePlaceholders(updateSize)},
                        tuple(paramList), conn=conn
                    )

        # Flag that any cached association metrics will be out of date
        self.dataManager.clearCacheData("analyzedPatientCount")
        self.dataManager.clearCacheData("clinicalItemCountsUpdated")

        # Database commit
        conn.commit()

        # Wipe out buffer to reflect incremental changes done, so any new ones should be recorded fresh
        updateBuffer.clear()
        updateBuffer["nAssociations"] = 0
    finally:
        if not extConn:
            conn.close()
# Select the second (index one) form (the first form is a search query box) #br.select_form(nr=1) # User credentials, single login and then store a cookie #br.form['username'] = '******' #br.form['password'] = '******' # Login #br.submit() # Base URL to query for, with parameters for subsets BASE_URL = 'http://www.samhsa.gov/medication-assisted-treatment/physician-program-data/treatment-physician-locator?field_bup_physician_us_state_value=All&page=%s' BASE_OUTPUT_FILENAME = 'buprenorphinePhysicians.%s.htm'; N_PAGES = 641 #N_PAGES = 10 progress = ProgressDots(big=100,small=2); for iPage in xrange(N_PAGES): sourceURL = BASE_URL % (iPage); sourceFile = br.open(sourceURL); sourceContent = sourceFile.read(); # Just store whole file in memory for simplicity sourceFile.close(); localFilename = BASE_OUTPUT_FILENAME % (iPage); localFile = open(localFilename,"w"); localFile.write(sourceContent); localFile.close(); progress.update(); progress.printStatus();
]; for categoryName in CATEGORY_LIST: for itemPrefix in ITEM_PREFIXES: specificArgv = ["-c", categoryName, "-i", itemPrefix, "results/queryResults.%s.%s.tab.gz" % (itemPrefix, categoryName) ]; specificArgvList.append(specificArgv); prog = ProgressDots(1,1,"Processes",total=len(specificArgvList)); for specificArgv in specificArgvList: key = "%.3d.%s" % (prog.getCounts(), str.join("_",specificArgv) ); key = key.replace("/",".."); # Don't want to use directory separator in temp log file name argv = list(baseArgv) argv.extend(specificArgv); log.info( "Starting: "+str.join(" ", argv) ); logFile = stdOpen(LOG_FILE_TEMPLATE % key,"w") # Blocking sub-process call if want serial processes. #subprocess.call(argv, stderr=logFile); # Non-blocking subprocess.Popen to spawn parallel processes process = subprocess.Popen(argv, stderr=logFile); log.info("Process ID: %s" % process.pid); # Print command lines to effectively generate a .sh script #print "nohup", #print str.join(" ", argv), #print "&>", LOG_FILE_TEMPLATE % key,"&" prog.update(); prog.printStatus();
def __call__(self, sourceFile, outputFile):
    """Parse a delimited record file and emit an HTML report to outputFile.

    The first line of sourceFile supplies (lower-cased) column headers. Each
    subsequent line that self.isNewRecordLine recognizes starts a new record;
    other lines are continuation text appended to the record's
    self.documentHeader field. Only every self.sampleInterval-th record is
    processed and output in detail.

    Returns the list of summary record dicts (one per sampled record).
    """
    headers = sourceFile.readline().split()
    for i, header in enumerate(headers):
        headers[i] = header.lower()

    prog = ProgressDots(50, 1)
    summaryRecords = list()

    headerData = {"script": HTML_SCRIPT, "style": HTML_STYLE}
    print(HTML_START % headerData, file=outputFile)

    nextRecord = None
    print('''<table class="dataTable" cellspacing=0 cellpadding=4>''', file=outputFile)
    iRecord = 0
    for line in sourceFile:
        if self.isNewRecordLine(line):  # New record line signals end of a record
            if nextRecord is not None:
                # Process the prior record
                if iRecord % self.sampleInterval == 0:
                    self.processRecord(nextRecord, iRecord)
                    self.outputRecordDetail(nextRecord, outputFile)
                    # Add just summary information to display later, without keeping
                    # whole text file in memory. This can still be ~2GB RAM per 1000
                    # records, so may be better to route to temporary file instead
                    summaryRecords.append(self.extractSummaryRecord(nextRecord))
                iRecord += 1
                prog.update()
            # Prepare a new record
            nextRecord = dict()
            chunks = line.split(self.delim)
            numChunks = len(chunks)
            for i, header in enumerate(headers):
                if i < numChunks:
                    nextRecord[header] = chunks[i]
                else:
                    # Not enough values. Probably because record continues on the
                    # next line. Record blank string for now.
                    nextRecord[header] = ""
            #nextRecord["contact_date"] = DBUtil.parseDateValue(nextRecord["contact_date"]);
        else:  # documentHeader in nextRecord
            # Continuing lines of text
            nextRecord[self.documentHeader] += line

    # Process the last record.
    # BUGFIX: guard against an input with no record lines at all, in which case
    # nextRecord is still None and the unguarded epilogue would crash.
    if nextRecord is not None:
        if iRecord % self.sampleInterval == 0:
            self.processRecord(nextRecord, iRecord)
            self.outputRecordDetail(nextRecord, outputFile)
            summaryRecords.append(self.extractSummaryRecord(nextRecord))
        iRecord += 1
        prog.update()
    # prog.printStatus();

    print('''</table>''', file=outputFile)
    self.outputSummaryRecords(summaryRecords, outputFile)
    print(HTML_END, file=outputFile)
    #for header in headerSet:
    #    print >> sys.stdout, header;
    return summaryRecords
"""Helped script for parallel process debugging. Just takes one parameter, sleeps for that many seconds, then completes""" import sys from medinfo.common.Util import ProgressDots import time duration = int(sys.argv[1]) progress = ProgressDots() for i in range(duration): progress.update() progress.printStatus()
def _loadSingleDxBaselines(filename):
    """First pass over the association file: collect single-diagnosis baselines
    from rows where Dx1 == Dx2. Returns (countByRxDx, countByDx) dicts keyed by
    (rx, dx) pairs and dx codes respectively.
    """
    countByDx = dict()
    countByRxDx = dict()
    with open(filename) as associationFile:  # with-block ensures file is closed
        associationFile.readline()  # Dump header row
        for line in associationFile:
            # NOTE: trailing newline on the last column is harmless to int()/float()
            chunks = line.split("\t")
            rx = chunks[0]
            dx1 = chunks[1]
            dx2 = chunks[2]
            if dx1 == dx2:
                rxDxCount = int(chunks[3])
                dxCount = int(chunks[5])
                countByRxDx[(rx, dx1)] = rxDxCount
                countByDx[dx1] = dxCount
    return (countByRxDx, countByDx)


def main(argv):
    """Compute association statistics for (Rx, Dx1, Dx2) combinations.

    argv[1]: tab-delimited association file with a header row and columns
    Rx, Dx1, Dx2, RxDx1Dx2Count, RxCount, Dx1Dx2Count, Total.
    First pass collects single-diagnosis baselines (Dx1 == Dx2 rows); second
    pass computes contingency statistics for each combo row, printing a
    tab-delimited report to stdout.
    """
    # Initial pass to get single diagnosis baselines by looking for
    # 2x Dx combos where Dx1 = Dx2.
    # (Also fixes the original's discarded no-op line.strip() calls.)
    (countByRxDx, countByDx) = _loadSingleDxBaselines(argv[1])

    # Second pass to now do stats for combo diagnoses, to see if prescription
    # shows difference between 1 or both diagnoses
    statIds = ( "P-Fisher", "P-YatesChi2", "oddsRatio", "relativeRisk", "interest", "LR+", "LR-", "sensitivity", "specificity", "PPV", "NPV", )
    headerCols = [ "Rx", "Dx1", "Dx2", "RxDx1Dx2Count", "RxDx1Count", "RxDx2Count", "RxCount", "Dx1Dx2Count", "Total", "E(RxDx1Dx2Count)", "E(RxDx2Dx1Count)", "E(Dx1Dx2Count)", "P-Chi2-Obs:Exp", ]
    headerCols.extend(statIds)
    print(str.join("\t", headerCols))

    progress = ProgressDots()
    with open(argv[1]) as associationFile:
        associationFile.readline()  # Dump header row
        for line in associationFile:
            chunks = line.split("\t")
            rx = chunks[0]
            dx1 = chunks[1]
            dx2 = chunks[2]
            rxDx1Dx2Count = int(chunks[3])
            rxDx1Count = countByRxDx[(rx, dx1)]
            rxDx2Count = countByRxDx[(rx, dx2)]
            rxCount = int(chunks[4])
            dx1Count = countByDx[dx1]
            dx2Count = countByDx[dx2]
            dx1dx2Count = int(chunks[5])
            totalCount = float(chunks[6])  # Floating point to auto-convert float divisions later

            conStats = ContingencyStats(rxDx1Dx2Count, rxCount, dx1dx2Count, totalCount)

            # Expected vs. observed Rx rates dependent on presence of Dx1Dx2
            # combination, based on whether Rx rates per diagnosis are
            # independent of the combination
            observed = array \
                ([  (rxDx1Dx2Count),
                    (dx1dx2Count - rxDx1Dx2Count),
                    (rxCount - rxDx1Dx2Count),
                    (totalCount - dx1dx2Count - rxCount + rxDx1Dx2Count),
                ])
            # Expected rates based on assumption that diagnoses occur independently of one another
            expectedRxDx1Dx2 = (rxDx1Count * dx2Count / totalCount)
            expectedRxDx2Dx1 = (rxDx2Count * dx1Count / totalCount)
            expectedDx1Dx2 = (dx1Count * dx2Count / totalCount)
            expected = array \
                ([  (expectedRxDx1Dx2),
                    (expectedDx1Dx2 - expectedRxDx1Dx2),
                    (rxCount - expectedRxDx1Dx2),
                    (totalCount - expectedDx1Dx2 - rxCount + expectedRxDx1Dx2),
                ])
            (chi2ObsExp, pChi2ObsExp) = chisquare(observed, expected)

            dataCells = [ rx, dx1, dx2, rxDx1Dx2Count, rxDx1Count, rxDx2Count, rxCount, dx1dx2Count, totalCount, expectedRxDx1Dx2, expectedRxDx2Dx1, expectedDx1Dx2, pChi2ObsExp ]
            for statId in statIds:
                try:
                    dataCells.append(conStats[statId])
                except ZeroDivisionError:
                    # Degenerate contingency cell; emit None placeholder
                    dataCells.append(None)
            # String conversion to allow for concatenation below
            dataCells = [str(value) for value in dataCells]
            print(str.join("\t", dataCells))
            progress.update()
    progress.printStatus()