def queryItems(self, options, outputFile):
    """Query order_med records matching the options criteria and stream the
    results as tab-delimited output to outputFile.

    Rows are grouped by medication / description / route with an order count
    per group, sorted by descending order frequency.
    """
    sleepInterval = float(options.pauseSeconds)

    # Aggregate order counts per medication / route combination
    medQuery = SQLQuery()
    medQuery.addSelect("count(order_med_id_coded) as nOrders")
    medQuery.addSelect("om.med_route, om.medication_id, om.med_description")
    medQuery.addFrom("starr_datalake2018.order_med as om")
    if options.descriptionPrefix:
        # Trailing wildcard enables prefix search on the description
        medQuery.addWhereOp("om.med_description", "like", options.descriptionPrefix + "%%")
    if options.medRoutes:
        medQuery.addWhereIn("om.med_route", options.medRoutes.split(","))
    medQuery.addGroupBy("om.medication_id, om.med_description, om.med_route")
    medQuery.addOrderBy("nOrders desc, om.med_description")

    resultFormatter = TextResultsFormatter(outputFile)
    progress = ProgressDots()
    for resultRow in DBUtil.execute(medQuery, includeColumnNames=True, connFactory=self.connFactory):
        resultFormatter.formatTuple(resultRow)
        time.sleep(sleepInterval)  # throttle between rows per caller-specified pause
        progress.update()
    progress.printStatus()
def main(self, argv):
    """Main method, callable from command line.

    Parses a tab-delimited schedule file (extracted from an Excel schedule)
    into schedule item records and writes them as tab-delimited output.
    """
    usageStr = "usage: %prog <inputFile> <outputFile>\n"+\
        " <inputFile> Tab-delimited input file taken from schedule Excel file. Example data format as seen in test case examples. See support/extractExcelSheets.py for help on pulling out Excel sheets into tab-delimited data files.\n"+\
        " <outputFile> File to output results to. Designate '-' for stdout."
    parser = OptionParser(usage=usageStr)
    # str() guards against DEFAULT_INDEX_PREFIX_LENGTH being defined as an int,
    # which would raise TypeError when concatenated into the help string
    parser.add_option("-i", "--providerIdFilename", dest="providerIdFilename", help="Name of provider ID CSV file. If provided, then add column for prov_id based on resident first_name and last_name, match within first "+str(DEFAULT_INDEX_PREFIX_LENGTH)+" characters, or generate ID value if no match found")
    parser.add_option("-y", "--baseYear", dest="baseYear", help="Year expect dates to start in.")
    parser.add_option("-t", "--changeTime", dest="changeTime", default=CHANGE_TIME, help="Hour of day that count as delimiter between rotations. Likely should NOT be midnight = 0, because night shifts span midnight. Default to 7 = 7am.")
    (options, args) = parser.parse_args(argv[1:])

    if len(args) >= 2 and options.baseYear:
        log.info("Starting: "+str.join(" ", argv))
        timer = time.time()
        baseYear = int(options.baseYear)
        if options.providerIdFilename is not None:
            providerReader = csv.DictReader(open(options.providerIdFilename))
            self.loadProviderModels(providerReader)
        inFile = stdOpen(args[0])
        scheduleItems = self.parseScheduleItems(inFile, baseYear)
        outFile = stdOpen(args[1], "w")
        formatter = TextResultsFormatter(outFile)
        formatter.formatResultDicts(scheduleItems)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def queryItems(self, options, outputFile):
    """Query clinical_item records joined to their categories per the options
    criteria, streaming the rows as tab-delimited text to outputFile.
    """
    sleepInterval = float(options.pauseSeconds)

    itemQuery = SQLQuery()
    itemQuery.addSelect("cic.description, ci.clinical_item_id, ci.name, ci.description")
    itemQuery.addFrom("clinical_item_category as cic")
    itemQuery.addFrom("clinical_item as ci")
    itemQuery.addWhere("cic.clinical_item_category_id = ci.clinical_item_category_id")
    if options.itemPrefix:
        # Trailing wildcard enables prefix search on the item description
        itemQuery.addWhereOp("ci.description", "like", options.itemPrefix + "%%")
    if options.categoryNames:
        itemQuery.addWhereIn("cic.description", options.categoryNames.split(","))
    itemQuery.addOrderBy("cic.description, ci.name, ci.description, ci.clinical_item_id")

    resultFormatter = TextResultsFormatter(outputFile)
    progress = ProgressDots()
    for resultRow in DBUtil.execute(itemQuery, includeColumnNames=True, connFactory=self.connFactory):
        resultFormatter.formatTuple(resultRow)
        time.sleep(sleepInterval)  # throttle between rows per caller-specified pause
        progress.update()
    progress.printStatus()
def main(argv=None):
    """Assemble the per-patient feature matrix: load the base patient cohort,
    attach lab features around the index item, attach clinical item timing
    features from a series of extract files, then write the filtered matrix
    as tab-delimited text.
    """
    timer = time.time()

    # Final columns to output to patient matrix
    colNames = list()

    patientById = parsePatientFile(stdOpen("patients.tab"), colNames)

    labsByBaseNameByPatientId = parseLabResultsFile(stdOpen("labs.tab"))
    addLabFeatures(labsByBaseNameByPatientId, patientById, colNames, INDEX_ITEM_BASE_NAME, LAB_BASE_NAMES, LAB_PRE_TIME, LAB_POST_TIME)

    log.info("Record presence of items in terms of relative time to each item from index time")
    # (sourceFilename, featureLabel, extra parse keyword options) per item file
    clinicalItemSources = \
        [   ("admitDx.tab", "ICD9.208-AdmitDx", {}),
            ("problemListDx.tab", "ICD9.208-ProblemListDx", {}),
            ("feSO4Rx.tab", "ironSO4", {}),
            ("allEnteralIron.tab", "ironEnteral", {}),
            ("ironIV.tab", "ironIV", {}),
            ("outpatientIronRx.tab", "ironOutpatient", {"patientIdCol": "pat_id", "timeCol": "ordering_date"}),
            ("transfusions.tab", "RBCTransfusion", {}),
        ]
    for (sourceFilename, featureLabel, parseOptions) in clinicalItemSources:
        itemTimesByPatientId = parseClinicalItemFile(stdOpen(sourceFilename), **parseOptions)
        addClinicalItemFeatures(itemTimesByPatientId, patientById, colNames, featureLabel)

    patientResults = filterPatients(patientById)

    log.info("Output feature matrix file with row per patient")
    featureMatrixFile = stdOpen("featureMatrix.lab14to1day.tab", "w")
    formatter = TextResultsFormatter(featureMatrixFile)
    formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True)

    timer = time.time() - timer
    print("%.3f seconds to complete" % timer, file=sys.stderr)
def action_default(self):
    """Execute the submitted SQL input and stash text / HTML formatted
    results (plus timing info) in self.requestData for the response page.
    """
    # Read checkboxes by presence or absence of field
    self.requestData["incCols"] = ""  # Checkboxes not passed if unchecked, so extra step to ensure uncheck is persisted
    incCols = False
    # "in" instead of has_key(): has_key was removed in Python 3, and the
    # containment protocol works for form objects in both Python versions
    if "incCols" in self.mForm:
        self.requestData["incCols"] = self.mForm["incCols"].value
        incCols = True

    # Point to the specified database
    connFactory = self.connectionFactory()

    timer = time.time()
    # Just execute a normal query, possibly with a result set
    results = DBUtil.execute(self.mForm["input"].value, includeColumnNames=incCols, connFactory=connFactory)

    if type(results) == list:
        # Result set, format as table
        formatter = TextResultsFormatter(StringIO())
        formatter.formatResultSet(results)
        self.requestData["resultsText"] = formatter.getOutFile().getvalue()

        headerRowFormat = None
        if incCols:
            headerRowFormat = "th"
        formatter = HtmlResultsFormatter(StringIO(), headerRowFormat)
        formatter.formatResultSet(results)
        self.requestData["resultsHtml"] = formatter.getOutFile().getvalue()

        self.requestData["resultsInfo"] = "(%d rows) " % len(results)
    else:
        self.requestData["resultsText"] = "%d rows affected (or other return code)" % results
    timer = time.time() - timer
    # NOTE(review): in the non-list branch "resultsInfo" is never initialized
    # here; presumably requestData is pre-populated by the framework — confirm
    self.requestData["resultsInfo"] += "(%1.3f seconds)" % timer
def main(self, argv):
    """Main method, callable from command line.

    Trains a model from the train file, scores the test file, and writes
    prediction results for downstream ROC analysis.
    """
    usageStr = "usage: %prog [options] <trainFile> <testFile> [<outputFile>]\n"+\
        " <trainFile> Tab-delimited file, queryItemIdsJSON expected to be parseable into lists of query items as well as an outcome.X column\n"+\
        " <testFile> Same structure as trainFile, but with test cases to assess prediction scoring\n"+\
        " <outputFile> Tab-delimited that can be used for ROC analysis with columns for outcome and predicted score\n"+\
        ""
    parser = OptionParser(usage=usageStr)
    parser.add_option("-o", "--outcomeItemId", dest="outcomeItemId", help="Outcome item IDs to assess get prediction scores for")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    # Require only train and test files; <outputFile> is optional per the
    # usage string (the previous "> 2" test forced a third argument and made
    # the two-argument / stdout form unreachable)
    if len(args) >= 2:
        trainFile = stdOpen(args[0])
        testFile = stdOpen(args[1])
        outcomeId = int(options.outcomeItemId)

        # Run the actual analysis
        (featureMatrix, outcomeMatrix, queryIds, rowModels) = self.fileToMatrixes(trainFile, outcomeId)
        model = self.train(featureMatrix, outcomeMatrix)
        analysisResults = self.predict(testFile, model, queryIds, outcomeId)

        # Format the results for output
        outputFilename = None
        if len(args) > 2:
            outputFilename = args[2]
        outputFile = stdOpen(outputFilename, "w")  # assumes stdOpen(None) yields stdout — confirm

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

        colNames = self.analysisHeaders(outcomeId)
        analysisResults.insert(0, RowItemModel(colNames, colNames))  # Insert a mock record to get a header / label row

        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def queryPatients(outputFile):
    """Collect patients with any ferritin lab result, flag surgery and
    dialysis proxies based on clinical item orders, and dump the cohort as
    tab-delimited text to outputFile.

    Returns a dict mapping patient_id (int) to its RowItemModel record.
    """
    log.info("Select patients with any result for a ferritin test")
    patientById = dict()
    cohortQuery = \
        """select distinct pat_id from stride_order_results as sor, stride_order_proc as sop where sor.order_proc_id = sop.order_proc_id and base_name = 'ferritin' """
    for (patientId, ) in DBUtil.execute(cohortQuery):
        patientId = int(patientId)
        patientById[patientId] = RowItemModel({"patient_id": patientId})

    # (logMessage, flagField, query) triples; each flag defaults to 0 and is
    # set to 1 for any cohort patient with a matching clinical item order
    flagConfigs = \
        [   ("Patients with admit or diet orders for surgery",  # Not perfectly accurate for isolating surgical patients
             "surgery",
             """select distinct patient_id from patient_item where clinical_item_id in (3614,4177,4220) """),
            ("Patients with an order for dialysis",  # (Does not differentiate acute vs. chronic. Includes peritoneal)
             "dialysis",
             """select distinct patient_id from patient_item where clinical_item_id in (1815,3783,4322) """),
        ]
    for (logMessage, flagField, flagQuery) in flagConfigs:
        log.info(logMessage)
        for patient in patientById.itervalues():
            patient[flagField] = 0  # Default to 0 / false
        for (patientId, ) in DBUtil.execute(flagQuery):
            if patientId in patientById:
                patientById[patientId][flagField] = 1

    # Drop results as tab-delimited text output
    formatter = TextResultsFormatter(outputFile)
    formatter.formatResultDicts(patientById.itervalues(), addHeaderRow=True)

    return patientById
def main(self, argv):
    """Main method, callable from command line.

    Concatenates multiple tab-delimited input files (or the files listed in a
    single index file) into one tab-delimited output stream.
    """
    usageStr = "usage: %prog [options] <inputFile1> <inputFile2> ... <inputFileN>\n"+\
        " <inputFileX> Tab-delimited file of data. Initial comment lines will be scanned for list of argv parameters to add as data columns.\n"+\
        " If only a single input is given, interpret this as an index file which lists the names of the other files to concatenate (e.g., obtained with dir * /b or ls).\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-o", "--outputFile", dest="outputFile", help="Tab-delimited file matching concatenated contents of input files. Specify \"-\" to send to stdout.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) > 0:
        inputFiles = list()
        if len(args) > 1:
            for inputFilename in args:
                inputFiles.append(stdOpen(inputFilename))
        else:  # len(args) == 1, Single index file rather than list of all files on command-line
            indexFile = stdOpen(args[0])
            for line in indexFile:
                inputFilename = line.strip()
                inputFiles.append(stdOpen(inputFilename))

        # Format the results for output
        outputFile = stdOpen(options.outputFile, "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results.
        # print() function form replaces the Python 2-only "print >>" statement
        # (a syntax error under Python 3, which sibling modules already target)
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        # Tab-delimited output formatting
        formatter = TextResultsFormatter(outputFile)

        # Begin the file parsing so can at least get the total list of column headers
        rowGenerator = self(inputFiles)
        # Builtin next() instead of the Python 2-only .next() generator method
        firstRow = next(rowGenerator)

        # Insert a mock record to get a header / label row
        colNames = self.resultHeaders()
        formatter.formatTuple(colNames)

        # Stream the concatenated data rows to the output to avoid storing all in memory
        formatter.formatResultDict(firstRow, colNames)
        for outputDict in rowGenerator:
            formatter.formatResultDict(outputDict, colNames)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def main(self, argv):
    """Main method, callable from command line.

    Bins labeled outcome scores into a calibration histogram, computes the
    Hosmer-Lemeshow statistic, and optionally renders a calibration figure.
    """
    usageStr = "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
        " <inputFile> Tab-delimited file, first two labeled columns expected to represent labeled outcome (0 and non-zero) and score/probability of outcome\n"+\
        " <outputFile> Tab-delimited table specifying score histogram bin widths, total cases, predicted events, actual events\n"+\
        " Leave blank or specify \"-\" to send to stdout.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-b", "--bins", dest="nBins", default=10, help="Number of bins to separate scores into, defaults to deciles (10)")
    parser.add_option("-f", "--figure", dest="figure", help="If set, will also try to auto-generate an example figure and store to a file here")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time()
    if len(args) > 1:
        inputFilename = args[0]
        inputFile = stdOpen(inputFilename)

        # Run the actual analysis
        analysisResults = self(inputFile, int(options.nBins))
        (hlStat, degFreedom, hlP) = self.calculateHosmerLemeshow(analysisResults)

        # Generate plot figure
        if options.figure is not None:
            self.generateFigure(analysisResults, options.figure)

        # Format the results for output
        outputFilename = None
        if len(args) > 1:
            outputFilename = args[1]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results.
        # print() function form replaces the Python 2-only "print >>" statement
        # (a syntax error under Python 3, which sibling modules already target)
        print(COMMENT_TAG, json.dumps({"argv": argv, "P-HosmerLemeshow": hlP}), file=outputFile)

        colNames = self.analysisHeaders()
        analysisResults.insert(0, RowItemModel(colNames, colNames))  # Insert a mock record to get a header / label row

        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def queryOutpatientIronRx(outputFile, patientById):
    """Query outpatient enteral iron prescriptions for the given patients and
    stream (pat_id, ordering_date) rows, tab-delimited, to outputFile.
    """
    log.info("Query outpatient Iron prescriptions")

    # Medication IDs derived by mapping through Iron as an ingredient
    poIronIngredientMedicationIds = (3065, 3066, 3067, 3071, 3074, 3077, 3986, 7292, 11050, 25006, 26797, 34528, 39676, 78552, 79674, 83568, 84170, 85151, 96118, 112120, 112395, 113213, 126035, 198511, 200455, 201994, 201995, 203679, 207059, 207404, 208037, 208072)
    # Medication IDs directly from prescriptions, formulations that did not map through RxNorm
    poIronDirectMedicationIds = (111354, 540526, 205010, 121171, 111320, 82791, 93962, 201795, 206722, 201068, 116045, 208725, 111341, 206637, 112400, 210256, 77529, 20844, 83798, 205523, 112428, 125474, 111343)
    allEnteralIronMedicationIds = set(poIronIngredientMedicationIds).union(poIronDirectMedicationIds)

    formatter = TextResultsFormatter(outputFile)

    colNames = ["pat_id", "ordering_date"]
    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_med")
    query.addWhereIn("medication_id", allEnteralIronMedicationIds)
    # keys() instead of the Python 2-only viewkeys(): in Python 3 keys() is
    # already a view, and addWhereIn only needs an iterable either way
    query.addWhereIn("pat_id", patientById.keys())
    query.addOrderBy("pat_id")
    query.addOrderBy("ordering_date")
    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
def queryLabResults(outputFile, patientById):
    """Query lab results of interest for the given patients and stream them,
    tab-delimited, to outputFile. Can take a while for large cohorts.
    """
    log.info("Query out lab results, takes a while")

    # Lab result base names considered relevant to the iron / anemia workup
    labBaseNames = \
        (   'ferritin','fe','trfrn','trfsat','ystfrr',
            'wbc','hgb','hct','mcv','rdw','plt',
            'retic','reticab','ldh','hapto','tbil','ibil','dbil',
            'cr','esr','crp'
        )

    formatter = TextResultsFormatter(outputFile)

    # Query rapid when filter by lab result type, limited to X records.
    # Filtering by patient ID drags down substantially until preloaded table by doing a count on the SOR table?
    colNames = ["pat_id", "base_name", "common_name", "ord_num_value", "reference_unit", "result_flag", "sor.result_time"]
    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("stride_order_results as sor, stride_order_proc as sop")
    query.addWhere("sor.order_proc_id = sop.order_proc_id")
    query.addWhereIn("base_name", labBaseNames)
    # keys() instead of the Python 2-only viewkeys(): in Python 3 keys() is
    # already a view, and addWhereIn only needs an iterable either way
    query.addWhereIn("pat_id", patientById.keys())
    query.addOrderBy("pat_id")
    query.addOrderBy("sor.result_time")
    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
def main_formatMergedTTests(argv):
    """Filter the concatenated t-test results down to key output columns,
    deriving SortType, TopicCount, and VerifyTime from filename-encoded
    parameters, and write the result as tab-delimited text.
    """
    ifs = stdOpen(BASE_RESULT_DIR+CONCATENATE_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR+FILTERED_FILENAME, "w")

    # print() function form replaces the Python 2-only "print >>" statement
    # (a syntax error under Python 3, which sibling modules already target)
    summaryData = {"argv": argv}
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = ["SortType","TopicCount","VerifyTime","Group1.precision.mean","Group1.recall.mean","Group1.normalprecision.mean","Group1.weightrecall.mean","Group1.roc_auc.mean","ttest_rel.precision","ttest_rel.recall","ttest_rel.weightrecall","ttest_rel.roc_auc","Group1.numqueryitems.mean","Group1.numverifyitems.mean","Group1.numrecommendeditems.mean","Group1.tp.mean"]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["Group1._s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        if row["Group1._m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.first24hourItems.2013.1234567890.filter.bow.gz.64Topic.model"
            topicChunk = row["Group1._m"].split(".")[-2]  # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk)

        # Expecting result file name argument of the form: "results/byOrderSets/01minutes/filteredResults.tab.gz"
        timeChunk = row["args[0]"].split("/")[-2]
        timeChunk = timeChunk[:timeChunk.find("minutes")]
        row["VerifyTime"] = int(timeChunk)

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
def test_numRecsByOrderSet(self):
    """Verify recommendation counts can be driven indirectly by a linked
    order set ID instead of an explicit -r count.

    Pipeline under test: PreparePatientItems output is captured from stdout,
    augmented with an order_set_id column, fed back in via stdin, and then
    scored by RecommendationClassificationAnalysis with --numRecsByOrderSet.
    """
    # Designate number of recommendations indirectly via linked order set id
    DBUtil.execute("update clinical_item set default_recommend = 0 where clinical_item_id = -8");  # Disable default recommend on one item to shift results
    colNames = ["patient_id", "TP", "FN", "FP", "recall", "precision", "F1-score", "weightRecall","weightPrecision", "ROC-AUC"];
    expectedResults = [ RowItemModel([-11111, 2, 0, 3, 1.0, 0.4, 0.571, 1.0, 0.3178, 0.4167], colNames ) ];

    # Do through fabricated prepared file intermediary
    # Capture the preparer's stdout output in memory
    sys.stdout = StringIO();
    argv = ["PreparePatientItems.py","-q","2","-v","3",'0,-11111',"-"];
    self.preparer.main(argv);
    preparedDataFile = StringIO(sys.stdout.getvalue());

    # Artificially add a key order set ID for the fabricated data
    modFile = StringIO();
    formatter = TextResultsFormatter(modFile);
    dataCols = None;
    for i, dataRow in enumerate(TabDictReader(preparedDataFile)):
        dataRow["order_set_id"] = TEST_ORDERSET_ID;
        if i <= 0:
            # First row: capture column order and emit the header line
            dataCols = list(dataRow.keys());
            formatter.formatTuple(dataCols);  # Insert a mock record to get a header / label row
        formatter.formatResultDict(dataRow, dataCols);
    preparedDataFile = StringIO(modFile.getvalue());

    sys.stdin = preparedDataFile;  # Read prepared data file from redirected stdin
    sys.stdout = StringIO();
    #argv = ["RecommendationClassificationAnalysis.py","-P","-r","5","-m","0","-R","ItemAssociationRecommender",'-',"-"];
    argv = ["RecommendationClassificationAnalysis.py","-P","--numRecsByOrderSet","-m","0","-R","ItemAssociationRecommender",'-',"-"];
    self.analyzer.main(argv);
    textOutput = StringIO(sys.stdout.getvalue());
    # Compare analyzer stats output against the expected per-patient metrics
    self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames);
def main(self, argv):
    """Main method, callable from command line.

    Uses a trained LDA/HDP topic model to generate item recommendations for
    each prepared patient record and scores them against verification sets.
    """
    usageStr = "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
        " <inputFile> Validation file in prepared result file format use generated LDA models to predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
        " <outputFile> Validation result stat summaries.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-M", "--modelFile", dest="modelFile", help="Name of the file to load an LDA or HDP model and topic word document counts from.")
    parser.add_option("-X", "--excludeCategoryIds", dest="excludeCategoryIds", help="For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids.")
    parser.add_option("-i", "--itemsPerCluster", dest="itemsPerCluster", default=DEFAULT_TOPIC_ITEM_COUNT, help="Specify number of top topic items to consider when scoring recommendations.")
    parser.add_option("-m", "--minClusterWeight", dest="minClusterWeight", default=DEFAULT_MIN_TOPIC_WEIGHT, help="When scoring recommendations, skip any topics with less than this relation weight (effectively scores as zero, but can avoid a lot of low yield calculations).")
    parser.add_option("-s", "--sortField", dest="sortField", default=DEFAULT_SORT_FIELD, help="Score field to sort top recommendations by. Default to posterior probabilty 'totelItemWeight', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.")
    parser.add_option("-r", "--numRecs", dest="numRecs", default=DEFAULT_RECOMMENDED_ITEM_COUNT, help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.")
    parser.add_option("-O", "--numRecsByOrderSet", dest="numRecsByOrderSet", action="store_true", help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time()
    if len(args) >= 1:
        query = AnalysisQuery()
        query.preparedPatientItemFile = stdOpen(args[0])
        query.recommender = TopicModelRecommender(options.modelFile)
        query.baseRecQuery = RecommenderQuery()
        if options.excludeCategoryIds is not None:
            query.baseRecQuery.excludeCategoryIds = set()
            # Fixed: previously read options.executeCategoryIds, which does not
            # exist (the option dest is "excludeCategoryIds") and raised
            # AttributeError whenever -X was supplied
            for categoryIdStr in options.excludeCategoryIds.split(","):
                query.baseRecQuery.excludeCategoryIds.add(int(categoryIdStr))
        else:  # Default exclusions if none specified
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds()
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds()
        query.baseRecQuery.itemsPerCluster = int(options.itemsPerCluster)
        query.baseRecQuery.minClusterWeight = float(options.minClusterWeight)
        query.baseRecQuery.sortField = options.sortField
        query.numRecommendations = int(options.numRecs)
        query.numRecsByOrderSet = options.numRecsByOrderSet

        # Run the actual analysis
        analysisResults = self(query)

        # Format the results for output
        outputFilename = None
        if len(args) > 1:
            outputFilename = args[1]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with analysis arguments to allow for deconstruction later
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        formatter = TextResultsFormatter(outputFile)
        colNames = self.resultHeaders(query)
        formatter.formatTuple(colNames)  # Insert a mock record to get a header / label row
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)
    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def outputSummaryRecords(self, summaryRecords, outputFile):
    """Render the summary records to outputFile as an HTML table (with
    javascript links back to the source records) plus a raw tab-delimited
    <textarea> for copy-paste into a spreadsheet.
    """
    # Field names with or without using data control links instead of just raw values
    headers = list(self.summaryHeaders)
    controlHeaders = list(self.summaryHeaders)
    for questionModule in self.questionModules:
        headers.append(questionModule.getName())
        # ".link" variant holds the link-wrapped form of each question column
        controlHeaders.append(questionModule.getName() + ".link")
    print('''<br> <table class="dataTable" cellspacing=0 cellpadding=4 style="width: 100%"> <tr><th class="subheading" colspan=100>Summary Table</th></tr>''', file=outputFile)
    textAreaRows = 50
    if not self.skipDetail:
        # HTML table form with links back to records
        formatter = HtmlResultsFormatter(outputFile, headerRowFormat='th class="labelCell"', align="center")
        formatter.formatTuple(headers)  # Header row
        for summaryRecord in summaryRecords:
            for questionModule in self.questionModules:
                questionName = questionModule.getName()
                linkFieldName = questionName + ".link"
                # Wrap each question value in a javascript link that jumps
                # back to the source record (interpolates iRecord and value)
                summaryRecord[linkFieldName] = ('<a href="javascript:setQuestionsByName(\'' + questionName + '\', %(iRecord)s)">%(' + questionName + ')s</a>') % summaryRecord
            formatter.formatResultDict(summaryRecord, controlHeaders)
        textAreaRows = 5  # If showing detail records, pay less attention to the raw text area
    # Raw result content for copy-paste to spreadsheet
    print('''<tr><td class="labelCell" style="color: 808080" colspan=100>Raw Table (Select All and Copy-Paste to Spreadsheet)</td></tr>''', file=outputFile)
    print('''<tr><td colspan=100><textarea style="width: 100%%;" disabled rows=%d>''' % textAreaRows, file=outputFile)
    formatter = TextResultsFormatter(outputFile)
    formatter.formatTuple(headers)
    for summaryRecord in summaryRecords:
        formatter.formatResultDict(summaryRecord, headers)
    print('''</textarea></td></tr>''', file=outputFile)
    print('''</table>''', file=outputFile)
    print("%d Records Processed" % len(summaryRecords), file=outputFile)
def queryClinicalItems(outputFile, clinicalItemIds, patientById):
    """Stream (patient_id, item_date) rows for the given clinical item IDs,
    restricted to the given patients, tab-delimited to outputFile.
    """
    log.info("Query Clinical Items: %s" % str(clinicalItemIds))
    formatter = TextResultsFormatter(outputFile)

    colNames = ["patient_id", "item_date"]
    query = SQLQuery()
    for col in colNames:
        query.addSelect(col)
    query.addFrom("patient_item")
    query.addWhereIn("clinical_item_id", clinicalItemIds)
    # keys() instead of the Python 2-only viewkeys(): in Python 3 keys() is
    # already a view, and addWhereIn only needs an iterable either way
    query.addWhereIn("patient_id", patientById.keys())
    query.addOrderBy("patient_id")
    query.addOrderBy("item_date")
    DBUtil.execute(query, includeColumnNames=True, formatter=formatter)
def main_formatResults(argv):
    """Reshape filtered result rows, deriving SortType, TopicCount, TrainTime,
    and VerifyTime from filename-encoded parameters, and write tab-delimited
    output.
    """
    ifs = stdOpen(BASE_RESULT_DIR + FILTERED_FILENAME)
    ofs = stdOpen(BASE_RESULT_DIR + FORMATTED_FILENAME, "w")

    # print() function form replaces the Python 2-only "print >>" statement
    # (a syntax error under Python 3, which sibling modules already target)
    summaryData = {"argv": argv}
    print(COMMENT_TAG, json.dumps(summaryData), file=ofs)

    outputCols = ["SortType", "TopicCount", "TrainTime", "VerifyTime", "precision", "recall", "normalprecision", "weightrecall", "roc_auc"]
    formatter = TextResultsFormatter(ofs)
    formatter.formatTuple(outputCols)  # Output header row

    reader = TabDictReader(ifs)
    for row in reader:
        row["SortType"] = row["_s"]

        # Extract out numerical data from filename text parameters
        row["TopicCount"] = None
        row["TrainTime"] = None
        if row["_m"] != 'None':
            # Expecting model name strings of the form: "models/topicModel.firstItems.q14400.v14400.2013.1234567890.filter.bow.gz.16Topic.model"
            chunks = row["_m"].split(".")
            topicChunk = chunks[-2]  # Expect second to last period-delimited chunk to contain topic count
            topicChunk = topicChunk[:topicChunk.find("Topic")]  # Remove trailing Topic text
            row["TopicCount"] = int(topicChunk)
            for chunk in chunks:
                # startswith avoids the IndexError chunk[0] raised on empty chunks
                if chunk.startswith("q") and chunk[-1].isdigit():  # This should be the query time in seconds
                    queryTimeSeconds = int(chunk[1:])
                    # Floor division keeps whole-minute ints under Python 3
                    # (plain "/" was already integer division under Python 2)
                    queryTimeMinutes = queryTimeSeconds // 60
                    row["TrainTime"] = queryTimeMinutes

        # Expecting training file name argument of the form: "sourceData/first24hourOrderSets.2013.q86400.v14400.-12345.tab.gz"
        row["VerifyTime"] = None
        for chunk in row["args_0_"].split("."):
            if chunk.startswith("v") and chunk[-1].isdigit():  # This should be the verify time in seconds
                verifyTimeSeconds = int(chunk[1:])
                verifyTimeMinutes = verifyTimeSeconds // 60
                row["VerifyTime"] = verifyTimeMinutes

        formatter.formatResultDict(row, outputCols)

    ifs.close()
    ofs.close()
def test_performance(self): """ Test performance against DataExtractor. """ # Initialize DB cursor. cursor = self.connection.cursor() # Initialize FeatureMatrixFactory. factoryStart = time.time() self.factory = FeatureMatrixFactory() # Build SQL query for list of patient episodes. patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereIn("proc_code", ["Foo", "Bar", "Baz", "Qux"]) patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Set and process patientEpisodeInput. self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time") self.factory.processPatientEpisodeInput() # Look for lab data 90 days before each episode, but never afterself. preTimeDelta = datetime.timedelta(-90) postTimeDelta = datetime.timedelta(0) # Add clinical item features. self.factory.addClinicalItemFeatures(["PerfItem300"]) self.factory.addClinicalItemFeatures(["PerfItem400"]) self.factory.addClinicalItemFeatures(["PerfItem500"]) # Add lab result features. self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta, postTimeDelta) self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta, postTimeDelta) self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta, postTimeDelta) self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta, postTimeDelta) # Add flowsheet features. 
self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta, postTimeDelta) # Build matrix. self.factory.buildFeatureMatrix() # Stop timer. factoryStop = time.time() # Initialize DataExtractor. extractorStart = time.time() extractor = DataExtractor() extractor.dataCache = dict() # Initialize output file. outFile = open("extractor.feature_matrix.tab.gz", "w") formatter = TextResultsFormatter(outFile) # Build SQL query for list of patient episodes. patientEpisodeQuery = SQLQuery() patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)") patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id") patientEpisodeQuery.addSelect("proc_code") patientEpisodeQuery.addSelect("order_time") patientEpisodeQuery.addSelect( "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results" ) patientEpisodeQuery.addFrom("stride_order_proc AS sop") patientEpisodeQuery.addFrom("stride_order_results AS sor") patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id") patientEpisodeQuery.addWhereIn("proc_code", ["Foo", "Bar", "Baz", "Qux"]) patientEpisodeQuery.addGroupBy( "pat_id, sop.order_proc_id, proc_code, order_time") patientEpisodeQuery.addOrderBy( "pat_id, sop.order_proc_id, proc_code, order_time") cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params) # Process patient episodes. patientEpisodes = list() row = cursor.fetchone() while row is not None: (pat_id, order_proc_id, proc_code, order_time, normal_results) = row patientEpisode = \ RowItemModel \ ( { "patient_id": pat_id, "order_proc_id": order_proc_id, "proc_code": proc_code, "order_time": order_time, "result_normal_count": normal_results } ) patientEpisodes.append(patientEpisode) row = cursor.fetchone() # Initialize patient data. lastPatientId = None colNames = None patientEpisodeByIndexTime = None # Look for lab data 90 days before each episode, but never afterself. preTimeDelta = datetime.timedelta(-90) postTimeDelta = datetime.timedelta(0) # Populate patient data. 
tempColNames = \ ["patient_id", "order_proc_id", "proc_code", "order_time", "result_normal_count"] for patientEpisode in patientEpisodes: patientId = patientEpisode["patient_id"] if lastPatientId is not None and lastPatientId != patientId: # New patient ID so start querying for patient specific data and # populating patient episode data. # Clinical Item (PerfItem300) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem300",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem400) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem400",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem500) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem500",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Lab Result (Foo) labResultTable = extractor.queryLabResults(["Foo"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \ preTimeDelta, postTimeDelta)) # Lab Result (Bar) labResultTable = extractor.queryLabResults(["Bar"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \ preTimeDelta, postTimeDelta)) # Lab Result (Baz) labResultTable = 
extractor.queryLabResults(["Baz"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \ preTimeDelta, postTimeDelta)) # Lab Result (Qux) labResultTable = extractor.queryLabResults(["Qux"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \ preTimeDelta, postTimeDelta)) # Flowsheet (Perflow) # tempFile = StringIO() # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile) # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\ # StringIO(tempFile.getvalue())) # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\ # patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \ # ["Perflow"], preTimeDelta, postTimeDelta, tempColNames)) if colNames is None: # First row, print header row colNames = tempColNames formatter.formatTuple(colNames) # Print out patient (episode) data (one row per episode) formatter.formatResultDicts(patientEpisodeByIndexTime.values(), colNames) if lastPatientId is None or lastPatientId != patientId: # Prepare to aggregate patient episode record per patient patientEpisodeByIndexTime = dict() patientEpisodeByIndexTime[ patientEpisode["order_time"]] = patientEpisode lastPatientId = patientId outFile.flush() # Last Iteration patientId = lastPatientId # Clinical Item (PerfItem300) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem300",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem400) eventTimes = extractor.parseClinicalItemData_singlePatient(\ 
modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem400",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Clinical Item (PerfItem500) eventTimes = extractor.parseClinicalItemData_singlePatient(\ modelListFromTable(extractor.queryClinicalItemsByName(\ ("PerfItem500",), [patientId]))) tempColNames.extend(\ extractor.addClinicalItemFeatures_singlePatient(\ eventTimes, patientEpisodeByIndexTime, "PerfItem300", \ daysBins=[])) # Lab Result (Foo) labResultTable = extractor.queryLabResults(["Foo"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \ preTimeDelta, postTimeDelta)) # Lab Result (Bar) labResultTable = extractor.queryLabResults(["Bar"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \ preTimeDelta, postTimeDelta)) # Lab Result (Baz) labResultTable = extractor.queryLabResults(["Baz"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \ preTimeDelta, postTimeDelta)) # Lab Result (Qux) labResultTable = extractor.queryLabResults(["Qux"], [patientId]) labsByBaseName = extractor.parseLabResultsData_singlePatient(\ modelListFromTable(labResultTable)) tempColNames.extend(extractor.addLabFeatures_singlePatient(\ patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \ preTimeDelta, postTimeDelta)) formatter.formatResultDicts(patientEpisodeByIndexTime.values(), colNames) # Close file. outFile.close() # Stop timer. 
extractorStop = time.time() # Compare results. factoryTime = factoryStop - factoryStart extractorTime = extractorStop - extractorStart self.assertTrue(extractorTime > factoryTime) # Clean up feature matrix files. try: os.remove("extractor.feature_matrix.tab.gz") except OSError: pass try: os.remove(self.factory.getMatrixFileName()) except OSError: pass
def queryPatientEpisodes(outputFile, extractor):
    """Select admissions under the target primary treatment team, attach
    encounter-level data (payor, admitting vitals), write the episodes as
    tab-delimited text to outputFile, and return the episode list.
    """
    log.info(
        "Select patient admissions with provider category of Tt Pamf Med (Primary) or Tt Med Univ (Primary)"
    )
    conn = DBUtil.connection()
    cursor = conn.cursor()
    try:
        # (A large commented-out block of legacy sepsis-cohort selection code
        # lived here; see the sibling sepsis variant of queryPatientEpisodes
        # for the live version of that logic.)

        # Phase one: patients with ED presentation and discharge times.
        cohortQuery = \
"""
select
    adt1.pat_anon_id,
    adt1.pat_enc_csn_anon_id,
    adt1.shifted_transf_in_dt_tm as edAdmitTime,
    adt2.shifted_transf_out_dt_tm as dischargeTime
from stride_adt as adt1, stride_adt as adt2
where
    adt1.pat_anon_id in
    (select patient_id from patient_item
     inner join clinical_item on patient_item.clinical_item_id = clinical_item.clinical_item_id
     where clinical_item.clinical_item_category_id = 161
     AND clinical_item.description = '%s')
    and adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id
""" % ("Tt Pamf Med (Primary)")
        print(cohortQuery, file=sys.stderr)
        cursor.execute(cohortQuery)

        patientEpisodes = list()
        patientEpisodeById = dict()

        # Accumulate the basic patient ID, ED presentation time, and
        # discharge time for each admission row.
        prog = ProgressDots()
        for cohortRow in iter(cursor.fetchone, None):
            (patientId, encounterId, edAdmitTime, dischargeTime) = cohortRow
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                ({
                    "patient_id": patientId,
                    "edAdmitTime": edAdmitTime,
                    "dischargeTime": dischargeTime,
                    "encounter_id": encounterId,
                    # Default encounter data to null in case can't find it later
                    "payorTitle": None,
                    "bpSystolic": None,
                    "bpDiastolic": None,
                    "temperature": None,
                    "pulse": None,
                    "respirations": None,
                })
            patientEpisodes.append(patientEpisode)
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[
                    patientEpisode["encounter_id"]] = patientEpisode
            prog.update()
        prog.printStatus()

        # Phase two: link encounter information (insurance, admitting vitals)
        # back onto the episodes collected above.
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id")
        query = SQLQuery()
        for selectCol in ("pat_id", "pat_enc_csn_id", "title", "bp_systolic",
                          "bp_diastolic", "temperature", "pulse",
                          "respirations"):
            query.addSelect(selectCol)
        query.addFrom("stride_patient_encounter")
        query.addWhereIn("pat_enc_csn_id", encounterIds)
        cursor.execute(str(query), query.params)

        for encounterRow in iter(cursor.fetchone, None):
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic,
             temperature, pulse, respirations) = encounterRow
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId]
                if patientEpisode["payorTitle"] is None:
                    # Single encounters may have multiple payors to track
                    patientEpisode["payorTitle"] = set()
                patientEpisode["payorTitle"].add(payorTitle)
                for vitalKey, vitalValue in (
                        ("bpSystolic", bpSystolic),
                        ("bpDiastolic", bpDiastolic),
                        ("temperature", temperature),
                        ("pulse", pulse),
                        ("respirations", respirations)):
                    patientEpisode[vitalKey] = vitalValue

        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True)

        return patientEpisodes
    finally:
        cursor.close()
        conn.close()
    def main(self, argv):
        """Main method, callable from command line.

        Parses options, then streams rows from the input file through this
        (callable) instance -- which yields t-test result dicts per subgroup
        pair -- and writes them tab-delimited to the output file.

        NOTE(review): uses Python 2 print-chevron syntax; predates a
        Python 3 migration.
        """
        usageStr = "usage: %prog [options] <inputFile> <outputFile>\n"+\
                    " <inputFile> Tab-delimited file of data\n"+\
                    " <ouputFile> Tab-delimited file with relational table of t-test p-values for each sub-group pair. Specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)
        parser.add_option(
            "-l",
            "--labelCols",
            dest="labelCols",
            help=
            "Comma-separated list of the column headers to label data rows as belonging to different subgroups"
        )
        parser.add_option(
            "-v",
            "--valueCols",
            dest="valueCols",
            help=
            "Comma-separated list of the column headers for data values want to calculate statistics for"
        )
        parser.add_option(
            "-m",
            "--matchCols",
            dest="matchCols",
            help=
            "Comma-separated list of the column headers to match groups on, like row identifiers. If not exists, then do independent t-tests rather than paired."
        )
        parser.add_option(
            "-b",
            "--baseLabels",
            dest="baseLabels",
            help=
            "Comma-separated list of values that the labelCols should have to represent which base method to compare all other methods to as a reference (otherwise do a full n^2 cartesian product of all combinations)."
        )
        (options, args) = parser.parse_args(argv[1:])

        log.info("Starting: " + str.join(" ", argv))
        timer = time.time()
        if len(args) > 1:
            inputFile = stdOpen(args[0])
            outputFile = stdOpen(args[1], "w")

            labelCols = options.labelCols.split(",")
            valueCols = options.valueCols.split(",")
            # matchCols / baseLabels are optional; None selects independent
            # t-tests / full cartesian comparison respectively.
            matchCols = None
            if options.matchCols:
                matchCols = options.matchCols.split(",")
            baseLabels = None
            if options.baseLabels:
                baseLabels = options.baseLabels.split(",")

            # Print comment line with arguments to allow for deconstruction later as well as extra results
            summaryData = { "argv": argv }
            print >> outputFile, COMMENT_TAG, json.dumps(summaryData)

            # Tab-delimited output formatting
            formatter = TextResultsFormatter(outputFile)

            # Prep generator first, so will be able to extract out relevant header columns
            rowGenerator = self(inputFile, labelCols, valueCols, matchCols,
                                baseLabels)

            # Insert a mock record to get a header / label row
            colNames = self.resultHeaders(labelCols, valueCols, matchCols)
            formatter.formatResultDict(RowItemModel(colNames, colNames),
                                       colNames)

            # Stream the concatenated data rows to the output to avoid storing all in memory
            for outputDict in rowGenerator:
                formatter.formatResultDict(outputDict, colNames)
        else:
            parser.print_help()
            sys.exit(-1)

        timer = time.time() - timer
        log.info("%.3f seconds to complete", timer)
def queryPatientEpisodes(outputFile, extractor):
    """Select ED admissions with possible/probable sepsis near admission.

    Finds patients with sepsis-suspicion orders (IV antibiotics, blood
    cultures, respiratory viral panels), excludes those on primary surgery
    teams, links encounter data (payor, admitting vitals), writes the
    episodes tab-delimited to outputFile, and returns the episode list.

    NOTE(review): Python 2 syntax (print-chevron, semicolons); predates a
    Python 3 migration.
    """
    log.info("Select patient admissions with possible/probable sepsis within 24 hours of admission (long query >60 min?)...");

    conn = DBUtil.connection();
    cursor = conn.cursor();
    try:
        # Clinical item category for admission diagnoses
        # ADMIT_DX_CATEGORY_ID = 2;
        # NOTE(review): admitDxCategoryId is computed but never used below --
        # possibly left over from an earlier version of this query.
        admitDxCategoryId = DBUtil.execute("select clinical_item_category_id from clinical_item_category where description like '%%ADMIT_DX%%'", conn=conn)[0][0];

        # Look for items indicating suspected infection / sepsis
        ivAntibioticItemIds = loadIVAntibioticItemIds(extractor);
        bloodCultureItemIds = loadBloodCultureItemIds(extractor);
        respiratoryViralPanelItemIds = loadRespiratoryViralPanelItemIds(extractor);

        # Merge IV antibiotics and blood cultures, respiratory panels as items that suggest sepsis is suspected
        suspectSepsisItemIds = ivAntibioticItemIds.union(bloodCultureItemIds.union(respiratoryViralPanelItemIds));
        suspectSepsisItemIdsStr = str.join(',', [str(itemId) for itemId in suspectSepsisItemIds]);    # Convert to comma-separated string via a str.join function on list contracture

        # Look for primary surgery teams to exclude
        excludeTeamCategory = "SurgerySpecialty";
        excludeTreatmentTeams = list();
        for row in extractor.loadMapData("TreatmentTeamGroups"):
            if row["team_category"] == excludeTeamCategory:
                excludeTreatmentTeams.append(row["treatment_team"]);
        # Translate the team descriptions into clinical_item_ids for the
        # SQL exclusion sub-query below.
        query = SQLQuery();
        query.addSelect("clinical_item_id");
        query.addFrom("clinical_item");
        query.addWhereIn("description", excludeTreatmentTeams );
        excludeTeamItemIds = set();
        for row in DBUtil.execute(query, conn=conn):
            excludeTeamItemIds.add(row[0]);
        excludeTeamItemIdsStr = str.join(',', [str(itemId) for itemId in excludeTeamItemIds]);    # Convert to comma-separated string via a str.join function on list contracture

        # First pass query to get the list of patients and emergency department presentation times
        cohortQuery = \
"""
-- Pick out date(s) when admitted through emergency department and matching discharge time
select
    adt1.pat_anon_id,
    adt1.pat_enc_csn_anon_id,
    adt1.shifted_transf_in_dt_tm as edAdmitTime,
    adt2.shifted_transf_out_dt_tm as dischargeTime
from stride_adt as adt1, stride_adt as adt2
where
    -- Admission event
    adt1.department_in = 'EMERGENCY DEPARTMENT' and
    adt1.event_in = 'Admission' and
    adt1.pat_anon_id in
    (   -- Select any patient with any suspected sepsis related order (i.e., IV antibiotics or blood cultures)
        select patient_id
        from patient_item as pi
        where pi.clinical_item_id in (%s)
        except
        -- Exclude any patient who has been on a primary surgery team
        select patient_id
        from patient_item
        where clinical_item_id in (%s)
        -- -12434586418575,-12432455207729,-12428492282572,-12428492282572,-12424048595257,-12414081679705
    ) and
    adt1.pat_enc_csn_anon_id = adt2.pat_enc_csn_anon_id and
    -- Discharge event
    adt2.event_out = 'Discharge'
order by adt1.shifted_transf_in_dt_tm
""" % (suspectSepsisItemIdsStr, excludeTeamItemIdsStr);
        print >> sys.stderr, cohortQuery;
        cursor.execute(cohortQuery);

        patientEpisodes = list();
        patientEpisodeById = dict();

        # Collect Build basic patient ID and
        #   ED presentation dates and Discharge date/time
        prog = ProgressDots();
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, edAdmitTime, dischargeTime) = row;
            #patientId = int(patientId);
            patientEpisode = \
                RowItemModel \
                (   {   "patient_id":patientId,
                        "edAdmitTime":edAdmitTime,
                        "dischargeTime":dischargeTime,
                        "encounter_id":encounterId,
                        "payorTitle": None, # Default encounter data to null in case can't find it later
                        "bpSystolic": None,
                        "bpDiastolic": None,
                        "temperature": None,
                        "pulse": None,
                        "respirations": None,
                    }
                );
            patientEpisodes.append(patientEpisode);
            if patientEpisode["encounter_id"] not in patientEpisodeById:
                patientEpisodeById[patientEpisode["encounter_id"]] = patientEpisode;
            prog.update();
            row = cursor.fetchone();
        prog.printStatus();

        # Second query phase to link to encounter information (e.g., insurance, admitting vital signs)
        encounterIds = columnFromModelList(patientEpisodes, "encounter_id");
        query = SQLQuery();
        query.addSelect("pat_id");
        query.addSelect("pat_enc_csn_id");
        query.addSelect("title");
        query.addSelect("bp_systolic");
        query.addSelect("bp_diastolic");
        query.addSelect("temperature");
        query.addSelect("pulse");
        query.addSelect("respirations");
        query.addFrom("stride_patient_encounter");
        query.addWhereIn("pat_enc_csn_id", encounterIds);
        cursor.execute(str(query), query.params);
        row = cursor.fetchone();
        while row is not None:
            (patientId, encounterId, payorTitle, bpSystolic, bpDiastolic, temperature, pulse, respirations) = row;
            if encounterId in patientEpisodeById:
                patientEpisode = patientEpisodeById[encounterId];
                if patientEpisode["payorTitle"] is None:
                    patientEpisode["payorTitle"] = set();   # Single encounters may have multiple payors to track
                patientEpisode["payorTitle"].add(payorTitle);
                patientEpisode["bpSystolic"] = bpSystolic;
                patientEpisode["bpDiastolic"] = bpDiastolic;
                patientEpisode["temperature"] = temperature;
                patientEpisode["pulse"] = pulse;
                patientEpisode["respirations"] = respirations;
            row = cursor.fetchone();

        # Drop results as tab-delimited text output
        formatter = TextResultsFormatter(outputFile);
        formatter.formatResultDicts(patientEpisodes, addHeaderRow=True);

        return patientEpisodes;
    finally:
        cursor.close();
        conn.close();
def main(self, argv): """Main method, callable from command line""" usageStr = "usage: %prog [options] <recommenderName> <patientIds> [<outputFile>]\n"+\ " <patientIds/dataFile> Name of file with patient ids. If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for. Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"+\ " <outputFile> If query yields a result set, then that will be output\n"+\ " to the named file. Leave blank or specify \"-\" to send to stdout.\n" parser = OptionParser(usage=usageStr) parser.add_option( "-c", "--baseCategoryId", dest="baseCategoryId", help= "ID of clinical item category to look for initial items from (probably the ADMIT Dx item)." ) parser.add_option( "-Q", "--queryTimeSpan", dest="queryTimeSpan", help= "Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above. Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified)." ) parser.add_option( "-o", "--outcomeItemIds", dest="outcomeItemIds", help= "Comma separated list of outcome item IDs to get prediction / recommendation scores for, as well as to label whether they actually appeared for the given patients. Can specify virtual items representing the end of item triples (e.g., 5-Readmission being the end of any item followed by 3591-Discharge then 3671-Admit), by adding the component items in expected sequence. For example, '5=3591:3671'" ) parser.add_option( "-t", "--timeDeltaMax", dest="timeDeltaMax", help= "Time delta in seconds maximum by which recommendations should be based on. Defaults to recommending items that occur at ANY time after the key orders. 
If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items. If set, will also only count presence of labeled target items if occurs within the given time delta of the first query item." ) parser.add_option( "-P", "--preparedPatientItemFile", dest="preparedPatientItemFile", action="store_true", help= "If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database." ) parser.add_option( "-R", "--recommender", dest="recommender", help= "Name of the recommender to run the analysis against. Options: %s" % list(RECOMMENDER_CLASS_BY_NAME.keys())) parser.add_option( "-S", "--scoreField", dest="scoreField", help= "Name of (derived) field to score items by. For example, 'conditionalFreq.'" ) parser.add_option( "-p", "--countPrefix", dest="countPrefix", help= "Which counting method to use for item associations. Defaults to counting item occurrences, allowing for duplicates. Additional options include: %s." % list(COUNT_PREFIX_OPTIONS)) parser.add_option( "-a", "--aggregationMethod", dest="aggregationMethod", help= "Aggregation method to use for recommendations based off multiple query items. Options: %s." % list(AGGREGATOR_OPTIONS)) parser.add_option( "-s", "--skipIfOutcomeInQuery", dest="skipIfOutcomeInQuery", action="store_true", help= "If set, will skip patients where the outcome item occurs during the query period since that would defy the point of predicting the outcome." ) parser.add_option( "-m", "--maxRecommendedId", dest="maxRecommendedId", help= "Specify a maximum ID value to accept for recommended items. 
More used to limit output in test cases" ) (options, args) = parser.parse_args(argv[1:]) log.info("Starting: " + str.join(" ", argv)) timer = time.time() if len(args) > 0: # Parse out the query parameters query = AnalysisQuery() query.recommender = RECOMMENDER_CLASS_BY_NAME[ options.recommender]() query.recommender.dataManager.dataCache = dict() # Use local cache to speed up repeat queries query.baseRecQuery = RecommenderQuery() if options.preparedPatientItemFile: # Don't reconstruct validation data through database, just read off validation file query.preparedPatientItemFile = stdOpen(args[0]) else: patientIdsParam = args[0] try: # Try to open patient IDs as a file patientIdFile = stdOpen(patientIdsParam) query.patientIds = set(patientIdFile.read().split()) except IOError: # Unable to open as a filename, then interpret as simple comma-separated list query.patientIds = set(patientIdsParam.split(",")) query.baseCategoryId = int(options.baseCategoryId) # Category to look for clinical item to start accruing query items from query.queryTimeSpan = timedelta(0, int(options.queryTimeSpan)) query.baseRecQuery.targetItemIds = set() outcomeIdStrList = options.outcomeItemIds.split(",") for outcomeIdStr in outcomeIdStrList: outcomeIdComponents = outcomeIdStr.split("=") outcomeId = int(outcomeIdComponents[0]) query.baseRecQuery.targetItemIds.add(outcomeId) if len(outcomeIdComponents) > 1: sequenceIds = [ int(seqIdStr) for seqIdStr in outcomeIdComponents[1].split(":") ] query.sequenceItemIdsByVirtualItemId[ outcomeId] = tuple(sequenceIds) if options.timeDeltaMax is not None: query.baseRecQuery.timeDeltaMax = timedelta( 0, int(options.timeDeltaMax)) if options.scoreField is not None: query.baseRecQuery.sortField = options.scoreField if options.countPrefix is not None: query.baseRecQuery.countPrefix = options.countPrefix if options.aggregationMethod is not None: query.baseRecQuery.aggregationMethod = options.aggregationMethod if options.maxRecommendedId is not None: 
query.baseRecQuery.maxRecommendedId = int( options.maxRecommendedId) if options.skipIfOutcomeInQuery is not None: query.skipIfOutcomeInQuery = options.skipIfOutcomeInQuery # Run the actual analysis analysisResults = self(query) # Format the results for output outputFilename = None if len(args) > 1: outputFilename = args[1] outputFile = stdOpen(outputFilename, "w") # Print comment line with analysis arguments to allow for deconstruction later print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile) colNames = self.analysisHeaders(query) analysisResults.insert(0, RowItemModel(colNames, colNames)) # Insert a mock record to get a header / label row formatter = TextResultsFormatter(outputFile) formatter.formatResultDicts(analysisResults, colNames) else: parser.print_help() sys.exit(-1) timer = time.time() - timer log.info("%.3f seconds to complete", timer)
def main(self, argv): """Main method, callable from command line""" usageStr = "usage: %prog [options] <patientIds/dataFile> [<outputFile>]\n"+\ " <patientIds/dataFile> Name of file with patient ids. If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for. Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"+\ " <outputFile> If query yields a result set, then that will be output\n"+\ " to the named file. Leave blank or specify \"-\" to send to stdout.\n" parser = OptionParser(usage=usageStr) parser.add_option( "-q", "--numQuery", dest="numQuery", help= "Number of orders / items from each patient to use as query items to prime the recommendations. If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items" ) parser.add_option( "-v", "--numVerify", dest="numVerify", help= "Number of orders / items from each patient after the query items to use to validate recommendations. If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items. If left unset, then just use all remaining orders / items for that patient" ) parser.add_option( "-c", "--baseCategoryId", dest="baseCategoryId", help= "Instead of specifying first nQ query items, specify ID of clinical item category to look for initial items from (probably the ADMIT Dx item)." ) parser.add_option( "-b", "--baseItemId", dest="baseItemId", help= "Instead of specifying first nQ query items, specify ID of the specific clinical item to look for initial items from." ) parser.add_option( "-S", "--startDate", dest="startDate", help="Only look for test data occuring on or after this start date." 
) parser.add_option( "-E", "--endDate", dest="endDate", help="Only look for test data occuring before this end date.") parser.add_option( "-Q", "--queryTimeSpan", dest="queryTimeSpan", help= "Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above. Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified)." ) parser.add_option( "-V", "--verifyTimeSpan", dest="verifyTimeSpan", help= "Time frame specified in seconds over which to look for verify items after initial query item time. Will ignore the query items that occur within the queryTimeSpan." ) parser.add_option( "-P", "--preparedPatientItemFile", dest="preparedPatientItemFile", action="store_true", help= "If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database." ) parser.add_option( "-R", "--recommender", dest="recommender", help= "Name of the recommender to run the analysis against. Options: %s" % list(RECOMMENDER_CLASS_BY_NAME.keys())) parser.add_option( "-r", "--numRecs", dest="numRecs", help= "Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size." ) parser.add_option( "-O", "--numRecsByOrderSet", dest="numRecsByOrderSet", action="store_true", help= "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider." ) parser.add_option( "-s", "--sortField", dest="sortField", help= "Allow overriding of default sort field when returning ranked results" ) parser.add_option( "-f", "--fieldFilters", dest="fieldFilters", help= "Filters to exclude results. 
Comma-separated separated list of field-op:value exclusions where op is either < or > like, conditionalFreq<:0.1,frqeRatio<:1" ) parser.add_option( "-t", "--timeDeltaMax", dest="timeDeltaMax", help= "If set, represents a time delta in seconds maximum by which recommendations should be based on. Defaults to recommending items that occur at ANY time after the key orders. If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items." ) parser.add_option( "-a", "--aggregationMethod", dest="aggregationMethod", help= "Aggregation method to use for recommendations based off multiple query items. Options: %s." % list(AGGREGATOR_OPTIONS)) parser.add_option( "-p", "--countPrefix", dest="countPrefix", help= "Prefix for how to do counts. Blank for default item counting allowing repeats, otherwise ignore repeats for patient_ or encounter_" ) parser.add_option( "-m", "--maxRecommendedId", dest="maxRecommendedId", help= "Specify a maximum ID value to accept for recommended items. 
More used to limit output in test cases" ) (options, args) = parser.parse_args(argv[1:]) log.info("Starting: " + str.join(" ", argv)) timer = time.time() if len(args) >= 1: # Parse out the query parameters query = AnalysisQuery() query.recommender = RECOMMENDER_CLASS_BY_NAME[ options.recommender]() query.recommender.dataManager.dataCache = dict() # Use a dataCache to facilitate repeat queries if options.preparedPatientItemFile: # Don't reconstruct validation data through database, just read off validation file query.preparedPatientItemFile = stdOpen(args[0]) else: patientIdsParam = args[0] try: # Try to open patient IDs as a file patientIdFile = stdOpen(patientIdsParam) query.patientIds = set(patientIdFile.read().split()) except IOError: # Unable to open as a filename, then interpret as simple comma-separated list query.patientIds = set(patientIdsParam.split(",")) if options.numQuery is not None: query.numQueryItems = int(options.numQuery) query.numVerifyItems = int(options.numVerify) else: # Alternative to specify query time span starting from a key category query.queryTimeSpan = timedelta(0, int(options.queryTimeSpan)) query.verifyTimeSpan = timedelta( 0, int(options.verifyTimeSpan)) if options.baseCategoryId is not None or options.baseItemId is not None: if options.baseCategoryId is not None: query.baseCategoryId = int(options.baseCategoryId) # Category to look for clinical item to start accruing query items from if options.baseItemId is not None: query.baseItemId = int(options.baseItemId) if options.startDate is not None: query.startDate = DBUtil.parseDateValue(options.startDate) if options.endDate is not None: query.endDate = DBUtil.parseDateValue(options.endDate) query.baseRecQuery = RecommenderQuery() query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds( ) query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds( ) if options.timeDeltaMax is not None and len( options.timeDeltaMax) > 0: 
query.baseRecQuery.timeDeltaMax = timedelta( 0, int(options.timeDeltaMax)) if options.aggregationMethod is not None: query.baseRecQuery.aggregationMethod = options.aggregationMethod if options.countPrefix is not None: query.baseRecQuery.countPrefix = options.countPrefix if options.maxRecommendedId is not None: query.baseRecQuery.maxRecommendedId = int( options.maxRecommendedId) if options.sortField is not None: query.baseRecQuery.sortField = options.sortField if options.fieldFilters is not None: for fieldFilterStr in options.fieldFilters.split(","): (fieldOp, valueStr) = fieldFilterStr.split(":") query.baseRecQuery.fieldFilters[fieldOp] = float(valueStr) if options.numRecs is not None: query.numRecommendations = int(options.numRecs) else: # No recommendation count specified, then just use the same as the verify number query.numRecommendations = query.numVerifyItems query.numRecsByOrderSet = options.numRecsByOrderSet # Run the actual analysis analysisResults = self(query) # Format the results for output outputFilename = None if len(args) > 1: outputFilename = args[1] outputFile = stdOpen(outputFilename, "w") # Print comment line with analysis arguments to allow for deconstruction later summaryData = { "argv": argv } print(COMMENT_TAG, json.dumps(summaryData), file=outputFile) formatter = TextResultsFormatter(outputFile) colNames = self.resultHeaders(query) formatter.formatTuple(colNames) # Insert a mock record to get a header / label row formatter.formatResultDicts(analysisResults, colNames) else: parser.print_help() sys.exit(-1) timer = time.time() - timer log.info("%.3f seconds to complete", timer)
def main(self, argv): """Main method, callable from command line""" usageStr = "usage: %prog [options] <inputFile> [<outputFile>]\n"+\ " <inputFile> Validation file in prepared result file format. Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\ " <outputFile> Validation result stat summaries.\n" parser = OptionParser(usage=usageStr) parser.add_option( "-X", "--excludeCategoryIds", dest="excludeCategoryIds", help= "For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids." ) parser.add_option( "-s", "--sortField", dest="sortField", default=DEFAULT_SORT_FIELD, help= "Score field to sort top recommendations by. Default to posterior probabilty / positive predictive value 'P(B|A)', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting." ) parser.add_option( "-r", "--numRecs", dest="numRecs", default=DEFAULT_RECOMMENDED_ITEM_COUNT, help= "Number of orders / items to recommend for comparison against the verification set." 
) (options, args) = parser.parse_args(argv[1:]) log.info("Starting: " + str.join(" ", argv)) timer = time.time() if len(args) >= 1: query = AnalysisQuery() query.preparedPatientItemFile = stdOpen(args[0]) query.recommender = OrderSetRecommender() query.baseRecQuery = RecommenderQuery() if options.excludeCategoryIds is not None: query.baseRecQuery.excludeCategoryIds = set() for categoryIdStr in options.executeCategoryIds.split(","): query.baseRecQuery.excludeCategoryIds.add( int(categoryIdStr)) else: # Default exclusions if none specified query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds( ) query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds( ) query.baseRecQuery.sortField = options.sortField query.numRecommendations = int(options.numRecs) # Run the actual analysis analysisResults = self(query) # Format the results for output outputFilename = None if len(args) > 1: outputFilename = args[1] outputFile = stdOpen(outputFilename, "w") # Print comment line with analysis arguments to allow for deconstruction later summaryData = { "argv": argv } print(COMMENT_TAG, json.dumps(summaryData), file=outputFile) formatter = TextResultsFormatter(outputFile) colNames = self.resultHeaders(query) formatter.formatTuple(colNames) # Insert a mock record to get a header / label row formatter.formatResultDicts(analysisResults, colNames) else: parser.print_help() sys.exit(-1) timer = time.time() - timer log.info("%.3f seconds to complete", timer)
"Charlson.HemiplegiaParaplegia.pre", "Charlson.LiverMild.pre", "Charlson.LiverModSevere.pre", "Charlson.Malignancy.pre", "Charlson.MalignancyMetastatic.pre", "Charlson.MI.pre", "Charlson.PepticUlcer.pre", "Charlson.PeripheralVascular.pre", "Charlson.Renal.pre", "Charlson.Rheumatic.pre", "self_pay", "PO2A.last", "Pulse.last", "NA.last", "CR.last", "HCT.last", "WBC.last", "BUN.last", "TBIL.last", "K.last", "Resp.last", "Temp.last", "Urine.last", "BP_Low_Diastolic.last", "BP_High_Systolic.last", "Glasgow.Coma.Scale.Score.last", "TT.Cardiology.pre", "TT.CCU.HF.pre", "TT.CCU.pre", "TT.HemeOnc.pre", "TT.Medicine.pre", "TT.MICU.pre", "TT.Neurology.pre", "TT.SICU.pre", "TT.SurgerySpecialty.pre", "TT.Transplant.pre", "TT.Trauma.pre", "self_pay" ] ofs = stdOpen("simulatedData.ICUDNR.tab", "w") formatter = TextResultsFormatter(ofs) formatter.formatTuple(colNames) # Header row random.seed(987654321) # Consistent seed for reproducibility nPatients = 10000 # Random generator parameters ageRange = [30, 80] incomeRange = [20000, 200000] incomeStep = 1000 femaleRate = 0.5 # Ranges on uniform distribution to assign race labels. Leave ~50% empty for default White race raceRangesByLabel = \
#import mechanize #import cookielib from BeautifulSoup import BeautifulSoup import urllib from medinfo.common.Util import ProgressDots, stdOpen from medinfo.db.ResultsFormatter import TextResultsFormatter BASE_FILENAME = 'buprenorphinePhysicians.%s.htm' N_PAGES = 641 #N_PAGES = 5 OUTPUT_FILENAME = 'prescribers.suboxone.tab' ofs = stdOpen(OUTPUT_FILENAME, "w") formatter = TextResultsFormatter(ofs) colNames = list() allColsSeen = False progress = ProgressDots(big=100, small=2) for iPage in xrange(N_PAGES): localFilename = BASE_FILENAME % (iPage) localFile = open(localFilename) html = localFile.read() localFile.close() soup = BeautifulSoup(html) cells = soup("td") currRow = list()
def main(self, argv): """Main method, callable from command line""" usageStr = "usage: %prog [options] <inputFile> [<outputFile>]\n"+\ " <inputFile> Validation file in prepared result file format. Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\ " <outputFile> Validation result stat summaries.\n" parser = OptionParser(usage=usageStr) parser.add_option( "-r", "--numRecs", dest="numRecs", default=DEFAULT_RECOMMENDED_ITEM_COUNT, help= "Number of orders / items to recommend for comparison against the verification set, sorted in prevalence order. If skip or set <1, then will use all order set items found." ) parser.add_option( "-O", "--numRecsByOrderSet", dest="numRecsByOrderSet", action="store_true", help= "If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider." ) parser.add_option( "-s", "--sortField", dest="sortField", default=DEFAULT_SORT_FIELD, help= "Allow overriding of default sort field when returning ranked results (patient_count, name, description, etc.)" ) (options, args) = parser.parse_args(argv[1:]) log.info("Starting: " + str.join(" ", argv)) timer = time.time() if len(args) >= 1: query = AnalysisQuery() query.preparedPatientItemFile = stdOpen(args[0]) query.recommender = OrderSetRecommender() query.baseRecQuery = RecommenderQuery() # Default exclusions if none specified query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds( ) query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds( ) query.baseRecQuery.sortField = options.sortField query.numRecommendations = int(options.numRecs) query.numRecsByOrderSet = options.numRecsByOrderSet # Run the actual analysis analysisResults = self(query) # Format the results for output outputFilename = None if len(args) > 1: outputFilename = args[1] outputFile = stdOpen(outputFilename, "w") # 
Print comment line with analysis arguments to allow for deconstruction later summaryData = { "argv": argv } print(COMMENT_TAG, json.dumps(summaryData), file=outputFile) formatter = TextResultsFormatter(outputFile) colNames = self.resultHeaders(query) formatter.formatTuple(colNames) # Insert a mock record to get a header / label row formatter.formatResultDicts(analysisResults, colNames) else: parser.print_help() sys.exit(-1) timer = time.time() - timer log.info("%.3f seconds to complete", timer)
"""Given 2D Table of values, spit out "melted" long-relational form to feed into antibiogramData.js""" import sys, os from medinfo.common.Const import NULL_STRING from medinfo.common.Util import stdOpen from medinfo.db.ResultsFormatter import TabDictReader, TextResultsFormatter ifs = stdOpen(sys.argv[1]) # Input tab delimited file ofs = stdOpen(sys.argv[2], "w") # "-" for stdout reader = TabDictReader(ifs) formatter = TextResultsFormatter(ofs) for row in reader: bug = row["Bug"] for key in reader.fieldnames: value = row[key] if key != "Bug" and value and value != NULL_STRING: formatter.formatTuple([value, bug, key])
def main(argv=None): timer = time.time() extractor = DataExtractor() # Output file featureMatrixFile = stdOpen("featureMatrix.SepsisICU.encounters.tab.gz", "w") # Final columns to output to patient matrix colNames = list() patientEpisodes = extractor.parsePatientEpisodeFile( stdOpen("patientEpisodes.tab"), colNames) #patientIds = set(columnFromModelList(patientEpisodes, "patient_id")); log.info("Expand to index dates based start and end dates") # But only want one entry per patient patientByIndexTimeById = extractor.generateDateRangeIndexTimes( "edAdmitTime", "dischargeTime", patientEpisodes, colNames, timeInterval=None) log.info("Populate flowsheet summary statistics") flowsheetByNameByPatientId = extractor.parseFlowsheetFile( stdOpen("Flowsheet.tab.gz")) extractor.addFlowsheetFeatures(patientByIndexTimeById, flowsheetByNameByPatientId, FLOWSHEET_NAMES, FLOWSHEET_PRE_TIME_DELTA, FLOWSHEET_POST_TIME_DELTA, colNames) log.info("Populate laboratory result summary statistics") labsByBaseNameByPatientId = extractor.parseLabResultsFile( stdOpen("LabResults.tab.gz")) extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId, LAB_BASE_NAMES, LAB_PRE_TIME_DELTA, LAB_POST_TIME_DELTA, colNames) log.info("Populate IV Fluid accumulation") ivFluidsByPatientId = extractor.parseIVFluidFile( stdOpen("IsotonicIVFluids.tab.gz")) extractor.addIVFluidFeatures(patientByIndexTimeById, ivFluidsByPatientId, IVF_THRESHOLD_VOLUMES, IVF_CHECKPOINT_TIMES, colNames) log.info( "Record presence of items in terms of relative time to each item from index time" ) extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("IVAntibiotic.tab")), patientByIndexTimeById, colNames, "IVAntibiotic") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("BloodCulture.tab")), patientByIndexTimeById, colNames, "BloodCulture") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RespViralPanel.tab")), patientByIndexTimeById, colNames, 
"RespViralPanel") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")), patientByIndexTimeById, colNames, "AnyICULifeSupport") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")), patientByIndexTimeById, colNames, "AnyDNR") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")), patientByIndexTimeById, colNames, "AnyVasoactive") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")), patientByIndexTimeById, colNames, "AnyCRRT") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")), patientByIndexTimeById, colNames, "AnyVentilator") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")), patientByIndexTimeById, colNames, "ComfortCare") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")), patientByIndexTimeById, colNames, "PalliativeConsult") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Death.tab")), patientByIndexTimeById, colNames, "Death") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Birth.tab")), patientByIndexTimeById, colNames, "Birth") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Male.tab")), patientByIndexTimeById, colNames, "Male") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Female.tab")), patientByIndexTimeById, colNames, "Female") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile( stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceWhiteNonHispanicLatino") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")), patientByIndexTimeById, colNames, "RaceAsian") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile( 
stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceWhiteHispanicLatino") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceHispanicLatino") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")), patientByIndexTimeById, colNames, "RaceUnknown") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")), patientByIndexTimeById, colNames, "RaceOther") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")), patientByIndexTimeById, colNames, "RaceBlack") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")), patientByIndexTimeById, colNames, "RacePacificIslander") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")), patientByIndexTimeById, colNames, "RaceNativeAmerican") log.info( "Systemically Scan for Charlson comorbidities and Treatment Team categories" ) for filename in os.listdir("."): if filename.startswith(CHARLSON_PREFIX): diseaseName = filename if filename.endswith(".tab"): diseaseName = filename[:-len(".tab")] extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen(filename)), patientByIndexTimeById, colNames, diseaseName) if filename.startswith(TREATMENT_TEAM_PREFIX): teamName = filename if filename.endswith(".tab"): teamName = filename[:-len(".tab")] extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen(filename)), patientByIndexTimeById, colNames, teamName) log.info("Output feature matrix file with row per patient day") formatter = TextResultsFormatter(featureMatrixFile) formatter.formatTuple(colNames) for patientId, patientByIndexTime in patientByIndexTimeById.iteritems(): patientResults = patientByIndexTime.values() formatter.formatResultDicts(patientResults, colNames) timer = time.time() 
- timer print >> sys.stderr, "%.3f seconds to complete" % timer
def main(argv=None): timer = time.time() extractor = DataExtractor() # Final columns to output to patient matrix colNames = list() patientById = extractor.parsePatientFile(stdOpen("patients.tab"), colNames) log.info("Expand to index dates based start and end dates") patientByIndexTimeById = extractor.generateDateRangeIndexTimes( "firstLifeSupportDate", "lastContiguousDate", list(patientById.values()), colNames) log.info("Populate flowsheet summary statistics") flowsheetByNameByPatientId = extractor.parseFlowsheetFile( stdOpen("Flowsheet.tab.gz")) extractor.addFlowsheetFeatures(patientByIndexTimeById, flowsheetByNameByPatientId, FLOWSHEET_NAMES, FLOWSHEET_PRE_TIME_DELTA, FLOWSHEET_POST_TIME_DELTA, colNames) log.info("Populate laboratory result summary statistics") labsByBaseNameByPatientId = extractor.parseLabResultsFile( stdOpen("LabResults.tab.gz")) extractor.addLabFeatures(patientByIndexTimeById, labsByBaseNameByPatientId, LAB_BASE_NAMES, LAB_PRE_TIME_DELTA, LAB_POST_TIME_DELTA, colNames) log.info( "Record presence of items in terms of relative time to each item from index time" ) extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyICULifeSupport.tab")), patientByIndexTimeById, colNames, "AnyICULifeSupport") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyDNR.tab")), patientByIndexTimeById, colNames, "AnyDNR") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyVasoactive.tab")), patientByIndexTimeById, colNames, "AnyVasoactive") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyCRRT.tab")), patientByIndexTimeById, colNames, "AnyCRRT") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("AnyVentilator.tab")), patientByIndexTimeById, colNames, "AnyVentilator") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("ComfortCare.tab")), patientByIndexTimeById, colNames, "ComfortCare") 
extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("PalliativeConsult.tab")), patientByIndexTimeById, colNames, "PalliativeConsult") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Death.tab")), patientByIndexTimeById, colNames, "Death") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Birth.tab")), patientByIndexTimeById, colNames, "Birth") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Male.tab")), patientByIndexTimeById, colNames, "Male") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("Female.tab")), patientByIndexTimeById, colNames, "Female") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile( stdOpen("RaceWhiteNonHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceWhiteNonHispanicLatino") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceAsian.tab")), patientByIndexTimeById, colNames, "RaceAsian") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile( stdOpen("RaceWhiteHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceWhiteHispanicLatino") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceHispanicLatino.tab")), patientByIndexTimeById, colNames, "RaceHispanicLatino") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceUnknown.tab")), patientByIndexTimeById, colNames, "RaceUnknown") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceOther.tab")), patientByIndexTimeById, colNames, "RaceOther") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RaceBlack.tab")), patientByIndexTimeById, colNames, "RaceBlack") extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen("RacePacificIslander.tab")), patientByIndexTimeById, colNames, "RacePacificIslander") extractor.addClinicalItemFeatures( 
extractor.parseClinicalItemFile(stdOpen("RaceNativeAmerican.tab")), patientByIndexTimeById, colNames, "RaceNativeAmerican") log.info( "Systemically Scan for Charlson comorbidities and Treatment Team categories" ) for filename in os.listdir("."): if filename.startswith(CHARLSON_PREFIX): diseaseName = filename if filename.endswith(".tab"): diseaseName = filename[:-len(".tab")] extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen(filename)), patientByIndexTimeById, colNames, diseaseName) if filename.startswith(TREATMENT_TEAM_PREFIX): teamName = filename if filename.endswith(".tab"): teamName = filename[:-len(".tab")] extractor.addClinicalItemFeatures( extractor.parseClinicalItemFile(stdOpen(filename)), patientByIndexTimeById, colNames, teamName) log.info("Output feature matrix file with row per patient day") featureMatrixFile = stdOpen("featureMatrix.ICUDNR.tab.gz", "w") formatter = TextResultsFormatter(featureMatrixFile) for patientId, patientByIndexTime in patientByIndexTimeById.items(): patientResults = list(patientByIndexTime.values()) formatter.formatResultDicts(patientResults, colNames, addHeaderRow=True) timer = time.time() - timer print("%.3f seconds to complete" % timer, file=sys.stderr)