def patientItemCollectionLinkFromSourceItem(self, sourceItem, collectionItem, patientItem, conn):
    """Insert a patient_item_collection_link row tying the given patient_item
    to the given item_collection_item.

    Deduplicates first via the in-memory self.patient_item_collection_links
    hash cache; a database duplicate (IntegrityError) is logged and the key is
    still cached so it is not retried. Returns None.
    """
    # Composite-key hash guards against re-inserting a link already seen this run
    hash_key = hash('{}{}'.format(
        patientItem["patient_item_id"],
        collectionItem["item_collection_item_id"]))
    if hash_key in self.patient_item_collection_links:
        return

    # Produce a patient_item_collection_link record model for the given sourceItem
    patientItemCollectionLink = RowItemModel({
        "patient_item_id": patientItem["patient_item_id"],
        "item_collection_item_id": collectionItem["item_collection_item_id"],
    })
    insertQuery = DBUtil.buildInsertQuery(
        "patient_item_collection_link",
        list(patientItemCollectionLink.keys()))
    insertParams = list(patientItemCollectionLink.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
        self.patient_item_collection_links.add(hash_key)
    except conn.IntegrityError as err:
        # If turns out to be a duplicate, okay, just note it and continue to insert whatever else is possible
        log.warning(err)  # Logger.warn is a deprecated alias of warning
        self.patient_item_collection_links.add(hash_key)
def patientItemModelFromSourceItem(self, sourceItem, clinicalItem, conn):
    """Produce and insert a patient_item record model for the given sourceItem.

    Duplicate rows (unique-key IntegrityError) are logged and skipped so the
    rest of the batch can continue. Returns None.
    """
    # Produce a patient_item record model for the given sourceItem
    patient_item = RowItemModel({
        "external_id": None,
        "patient_id": int(sourceItem["rit_uid"][2:], 16),
        "encounter_id": None,
        "clinical_item_id": clinicalItem["clinical_item_id"],
        "item_date": str(sourceItem["itemDate"]),  # without str(), the time is being converted in postgres
        "item_date_utc": None,  # it's a date - so, no need to have a duplicate here
    })
    # list(...) materializes the dict views (consistent with sibling converters)
    insert_query = DBUtil.buildInsertQuery("patient_item", list(patient_item.keys()))
    insert_params = list(patient_item.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insert_query, insert_params, conn=conn)
    except conn.IntegrityError as err:  # was Python-2-only "except ..., err" syntax
        # If turns out to be a duplicate, okay, just note it and continue to insert whatever else is possible
        log.warning(err)
def test_OrderSetUsageAnalysis_numRecsByOrderSet(self):
    """Command-line run of OrderSetUsageAnalysis with --numRecsByOrderSet:
    only query / recommend one triggered order set at a time, instead of all
    that occur during the verify time period, then verify per-patient stats.
    """
    colNames = [
        "patient_id", "TP", "FN", "FP", "recall", "precision", "F1-score",
        "weightRecall", "weightPrecision", "weightF1-score",
        "numUsedOrderSets", "numUsedOrderSetItems",
        "numAvailableOrderSetItems", "numRecommendableUsedOrderSetItems",
        "numRecommendableAvailableOrderSetItems",
        "numRecommendableQueryVerifyItems"
    ]
    expectedResults = \
        [
            RowItemModel([-123, 5,4,1, 0.5555,0.833,0.6666,0.8026,0.9708,0.8787, 1, 2, 6, 2, 6,10], colNames ),  # Verify Items (1,2,3,4,8,9,11,12,13). Order Set 1 (1,2, 10, 11,12,13).
            RowItemModel([-456, 1,1,6, 0.5, 0.143,0.2222,0.2, 0.0727,0.1067, 1, 1, 8, 1, 7, 5], colNames ),  # Verify Items (12,8). Order Set 2 (2,3,5,6,7, 4,8, 77) (Order Set includes non-recommendable item 77. Count for usage rate, but not for recommender accuracy calculations)
            RowItemModel([-321, 5,4,1, 0.5555,0.833,0.6666,0.8026,0.9708,0.8787, 1, 2, 6, 2, 6,10], colNames ),  # Verify Items (1,2,3,4,8,9,11,12,13). Order Set 1 (1,2, 10, 11,12,13).
            RowItemModel([-321, 2,2,2, 0.5, 0.5, 0.5, 0.4029,0.5510,0.4655, 1, 1, 4, 1, 4,13], colNames ),  # Verify Items (12,14,15,16). Order Set 3 (5,15,6, 14,)
        ]

    # Analysis via prepared validation data file
    sys.stdin = StringIO(self.orderSetValidationFileStr)  # Read prepared data file from redirected stdin
    sys.stdout = StringIO()
    argv = ["OrderSetUsageAnalysis.py", "--numRecsByOrderSet", '-', "-"]
    self.analyzer.main(argv)
    textOutput = StringIO(sys.stdout.getvalue())
    #print >> sys.stderr, sys.stdout.getvalue();
    self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames)
def verifyVirtualItemLinked(self, itemIdSequence, virtualItemId, linkedItemIdsByBaseId, conn=None):
    """Ensure clinical_item_link records exist from virtualItemId to every
    item in itemIdSequence.

    Any missing links are created both in the database and in the in-memory
    linkedItemIdsByBaseId cache (mutated in place). A connection is created
    and closed locally only when the caller did not supply one.
    """
    ownConnection = conn is None
    if ownConnection:
        conn = self.connFactory.connection()
    try:
        # Alias the cached link set for this virtual item, creating it if absent
        knownLinks = linkedItemIdsByBaseId.setdefault(virtualItemId, set())
        for linkedId in itemIdSequence:
            if linkedId in knownLinks:
                continue  # Link already recorded; nothing to persist
            link = RowItemModel()
            link["clinical_item_id"] = virtualItemId
            link["linked_item_id"] = linkedId
            DBUtil.execute(
                DBUtil.buildInsertQuery("clinical_item_link", list(link.keys())),
                list(link.values()),
                conn=conn)
            knownLinks.add(linkedId)
    finally:
        if ownConnection:
            conn.close()
def patientItemFromSourceItem(self, sourceItem, clinicalItem, conn):
    """Produce and insert a patient_item record model for the given sourceItem,
    populating patientItem["patient_item_id"] whether the row was newly
    inserted or already existed as a duplicate.
    """
    # Produce a patient_item record model for the given sourceItem
    patientItem = RowItemModel({
        "external_id": sourceItem["order_med_id"],
        "patient_id": sourceItem["pat_id"],
        "encounter_id": sourceItem["pat_enc_csn_id"],
        "clinical_item_id": clinicalItem["clinical_item_id"],
        "item_date": sourceItem["ordering_date"],
    })
    insertQuery = DBUtil.buildInsertQuery("patient_item", list(patientItem.keys()))
    insertParams = list(patientItem.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
        patientItem["patient_item_id"] = DBUtil.execute(
            DBUtil.identityQuery("patient_item"), conn=conn)[0][0]
    except IntegrityError as err:  # was Python-2-only "except IntegrityError, err" syntax
        # If turns out to be a duplicate, okay, pull out existing ID and continue to insert whatever else is possible
        log.info(err)
        # Lookup just by the composite key components to avoid attempting duplicate insertion again
        searchPatientItem = {
            "patient_id": patientItem["patient_id"],
            "clinical_item_id": patientItem["clinical_item_id"],
            "item_date": patientItem["item_date"],
        }
        (patientItem["patient_item_id"], isNew) = DBUtil.findOrInsertItem(
            "patient_item", searchPatientItem, conn=conn)
def test_recommender_stats_commandline(self):
    """Run the recommender via its command-line interface against the mock
    test data above and verify expected stats calculations, first with
    patient-unique counts then with non-unique counts."""
    # Run the recommender against the mock test data above and verify expected stats calculations
    log.debug("Query with single item not perturbed by others.");
    headers = ["clinical_item_id","N","nB","nA","nAB","conditionalFreq","baselineFreq","freqRatio","P-Fisher"];
    expectedData = \
        [   RowItemModel( [-2, SIMULATED_PATIENT_COUNT, 30.0, 70.0, 7.0, 0.1, 0.0100, 10.0, 3.7e-06], headers ),
            RowItemModel( [-4, SIMULATED_PATIENT_COUNT, 40.0, 70.0, 20.0, 0.286, 0.0133, 21.42857, 1.2e-23], headers ),
        ];
    sys.stdout = StringIO();    # Redirect stdout output to collect test results
    argv = ["ItemRecommender.py","maxRecommendedId=0&queryItemIds=-6&countPrefix=patient_&resultCount=3&sortField=P-Fisher","-"];
    self.recommender.main(argv);
    textOutput = StringIO(sys.stdout.getvalue());
    self.assertEqualRecommendedDataStatsTextOutput( expectedData, textOutput, headers );

    log.debug("Query for non-unique counts.");
    headers = ["clinical_item_id","N","nB","nA","nAB","conditionalFreq","baselineFreq","freqRatio","oddsRatio"];
    expectedData = \
        [   RowItemModel( [-4, SIMULATED_PATIENT_COUNT, 40.0, 70.0, 25.0, 0.35714, 0.01333, 26.7857, 107.96296], headers ),
            RowItemModel( [-2, SIMULATED_PATIENT_COUNT, 30.0, 70.0, 12.0, 0.1714, 0.01, 17.1429, 33.47126], headers ),
        ];
    sys.stdout = StringIO();    # Redirect stdout output to collect test results
    argv = ["ItemRecommender.py","maxRecommendedId=0&queryItemIds=-6&countPrefix=&resultCount=3&sortField=oddsRatio","-"];
    self.recommender.main(argv);
    textOutput = StringIO(sys.stdout.getvalue());
    self.assertEqualRecommendedDataStatsTextOutput( expectedData, textOutput, headers );
def generatePatientItemsForCompositeId(self, clinicalItemIds, compositeId, conn=None):
    """Create patient_item records for the composite to match the given
    clinical item ID patient items.

    Links compositeId to each component in clinical_item_link, then copies
    every existing patient_item of the (flattened) linked components into a
    new patient_item under compositeId. Duplicate copies are logged and skipped.
    """
    extConn = True
    if conn is None:
        conn = self.connFactory.connection()
        extConn = False
    try:
        # Record linking information
        for componentId in clinicalItemIds:
            linkModel = RowItemModel()
            linkModel["clinical_item_id"] = compositeId
            linkModel["linked_item_id"] = componentId
            insertQuery = DBUtil.buildInsertQuery("clinical_item_link", list(linkModel.keys()))
            insertParams = list(linkModel.values())
            DBUtil.execute(insertQuery, insertParams, conn=conn)

        # Extract back link information, which will also flatten out any potential inherited links
        linkedItemIdsByBaseId = self.loadLinkedItemIdsByBaseId(conn=conn)
        linkedItemIds = linkedItemIdsByBaseId[compositeId]

        # Create patientItem records for the composite clinical item to overlap existing component ones
        # First query for the existing component records
        query = SQLQuery()
        query.addSelect("*")
        query.addFrom("patient_item")
        query.addWhereIn("clinical_item_id", linkedItemIds)
        results = DBUtil.execute(query, includeColumnNames=True, conn=conn)
        patientItems = modelListFromTable(results)

        # Patch component records to instead become composite item records then insert back into database
        progress = ProgressDots(total=len(patientItems))
        for patientItem in patientItems:
            del patientItem["patient_item_id"]  # Let the database assign a fresh primary key
            patientItem["clinical_item_id"] = compositeId
            patientItem["analyze_date"] = None
            insertQuery = DBUtil.buildInsertQuery("patient_item", list(patientItem.keys()))
            insertParams = list(patientItem.values())
            try:
                # Optimistic insert of a new unique item
                DBUtil.execute(insertQuery, insertParams, conn=conn)
            except conn.IntegrityError as err:  # was Python-2-only "except ..., err" syntax
                # If turns out to be a duplicate, okay, just note it and continue to insert whatever else is possible
                log.info(err)
            progress.Update()
        # progress.PrintStatus();
    finally:
        if not extConn:
            conn.close()
def compositeRelated(self, clinicalItemIds, itemName, itemDescription, categoryId, compositeId=None, conn=None):
    """A new clinical item will be created, with patient item records created to match every one
    currently matching one of the specified clinical items.

    Parameters specify new composite item name/code, description, and clinical item category
    to be created under. Option to explicitly specify the composite clinical item Id value
    rather than taking a sequence number value (convenient for test cases).

    Returns ID of newly created item.

    Depending on context, may wish to deactivateAnalysis of component items once this composite
    one is created if they are no longer of interest.
    Newly created composite item's default_recommend attribute will be reset to 0 since it
    presumably does not represent a discrete order item.
    Linking records will be created in clinical_item_link between the composite and component
    clinical items so that these relationships can be reconstructed.

    Examples this could be relevant for:
    ICUVasopressors to include all vasopressor infusions (dopamine, norepinephrine, epinephrine, vasopressin, etc.)
    All blood transfusion indexes, G vs J vs Feeding tube equivalent,
    Ear, Eyes med routes irrelevant which ear/eye.
    Eventually lump together medication classes (e.g., any "PPI" same difference as choosing pantoprazole or omeprazole).
    Eventually lump together diagnosis codes by major prefix to reduce granularity and improve general signal.
    """
    extConn = True
    if conn is None:
        conn = self.connFactory.connection()
        extConn = False
    try:
        # Build new composite item
        compositeItem = RowItemModel()
        compositeItem["name"] = itemName
        compositeItem["description"] = itemDescription
        compositeItem["clinical_item_category_id"] = categoryId
        compositeItem["default_recommend"] = 0  # Composite is not a discrete order item
        if compositeId is not None:
            compositeItem["clinical_item_id"] = compositeId

        insertQuery = DBUtil.buildInsertQuery("clinical_item", compositeItem.keys())
        insertParams = compositeItem.values()
        DBUtil.execute(insertQuery, insertParams, conn=conn)
        if compositeId is None:
            compositeId = DBUtil.execute(
                DBUtil.identityQuery("clinical_item"), conn=conn)[0][0]  # Retrieve the just inserted item's ID

        self.generatePatientItemsForCompositeId(clinicalItemIds, compositeId, conn=conn)
        return compositeId
    finally:
        if not extConn:
            conn.close()
def setUp(self):
    """Prepare state for test cases"""
    DBTestCase.setUp(self)

    log.info( "Populate the database with test data (Assumes MySQL data structure)" )
    DBUtil.execute \
    ("""create table %s
        (
        USER_ID varchar(255),
        USER_NAME varchar(255),
        DE_PAT_ID bigint,
        ACCESS_DATETIME datetime,
        METRIC_ID integer,
        METRIC_NAME text,
        LINE_COUNT integer,
        DESCRIPTION text,
        METRIC_GROUP_NUM integer,
        METRIC_GROUP_NAME text
        );
     """ % TEST_SOURCE_TABLE
    )

    self.testUserIDs = list()
    headers = [
        "user_id", "user_name", "de_pat_id", "access_datetime", "metric_id",
        "metric_name", "line_count", "description", "metric_group_num",
        "metric_group_name"
    ]
    # NOTE(review): one description below carries an embedded newline in the
    # source data ('(Created messages counted.) \n') — TODO confirm exact whitespace
    dataModels = \
        [
            RowItemModel( ['S-7', 'CJ', None, '2013-10-14 08:44:47', '33006', 'ME_IBGLANCE', '1', 'IN BASKET GLANCE PLUGIN ACCESSED IN RADAR', '33000', 'Radar'], headers ),
            RowItemModel( ['S-7', 'CJ', '3289034', '2014-03-20 00:40:18', '34127', 'IP_ORDERSSECTION', '1', 'Inpatient Orders section opened ', '17001', 'PATIENT CLINICAL INFO'], headers ),
            RowItemModel( ['S-7', 'CJ', None, '2014-01-01 10:10:56', '20008', 'AC_IB_CREATEMSG', '1', 'In Basket message of any type created.', '20000', 'In Basket Report'], headers ),
            RowItemModel( ['S-7', 'CJ', None, '2014-01-01 10:10:56', '20008', 'AC_IB_CREATEMSG', '2', '(Created messages counted.) \n', '20000', 'In Basket Report'], headers ),
            RowItemModel( ['S-7', 'CJ', '1853397', '2013-06-29 11:25:02', '20075', 'AC_DOCUMENTLIST_SUBC', '1', 'Prelude Documents list accessed for patient.', None, None], headers ),
            RowItemModel( ['S-4', 'AB', '3133593', '2013-10-22 06:46:29', '17008', 'MR_REPORTS', '1', 'A report with patient data accessed.', '17001', 'PATIENT CLINICAL INFO'], headers ),
            RowItemModel( ['S-4', 'AB', '3047429', '2014-03-16 20:56:54', '17016', 'MR_RESULTS_REVIEW', '1', 'Results Review activity accessed.', '17002', 'Patient Chart Review'], headers ),
            RowItemModel( ['S-4', 'AB', '3408732', '2014-04-08 08:47:38', '17016', 'MR_RESULTS_REVIEW', '1', 'Results Review activity accessed.', '17002', 'Patient Chart Review'], headers ),
            RowItemModel( ['S-4', 'AB', None, '2014-02-26 19:27:48', '34140', 'IP_SYSTEM_LIST', '1', 'Inpatient system list accessed.', '20001', 'PATIENT DEMOGRAPHICS'], headers ),
            RowItemModel( ['S-4', 'AB', '2487184', '2013-10-11 08:45:46', '17008', 'MR_REPORTS', '1', 'A report with patient data accessed.', '17001', 'PATIENT CLINICAL INFO'], headers ),
        ]
    for dataModel in dataModels:
        (dataItemId, isNew) = DBUtil.findOrInsertItem(TEST_SOURCE_TABLE, dataModel, retrieveCol="user_id")
        userID = int(dataItemId[1:])  # Trim leading S and parse remainder as an integer
        self.testUserIDs.append(userID)

    self.converter = ProviderRotationConversion()  # Instance to test on
    self.converter.sourceTableName = TEST_SOURCE_TABLE
def test_recommender_aggregation(self):
    """Test different scoring aggregation methods (default weighted,
    unweighted, SerialBayes, NaiveBayes) on one query, then a field-value
    filter on the recommendations."""
    # Test different scoring aggregation methods
    query = RecommenderQuery()
    query.countPrefix = "patient_"
    query.queryItemIds = set([-2, -5])
    #query.excludeItemIds = set();
    #query.categoryIds = set();
    #query.timeDeltaMax = None;  # If set to one of the constants (DELTA_ZERO, DELTA_HOUR, etc.), will count item associations that occurred within that time delta as co-occurrent. If left blank, will just consider all items within a given patient as co-occurrent.
    query.limit = 3  # Just get top 3 ranks for simplicity
    query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

    headers = ["clinical_item_id", "conditionalFreq", "freqRatio"]

    # Default weighted aggregation method
    expectedData = \
        [   RowItemModel( [-4, 0.3, 22.5], headers ),
            RowItemModel( [-6, 0.16667, 7.142857], headers ),
        ]
    recommendedData = self.recommender(query)
    self.assertEqualRecommendedData(expectedData, recommendedData, query)

    # Change to unweighted aggregation method
    query.aggregationMethod = "unweighted"
    expectedData = \
        [   RowItemModel( [-4, 0.32857, 24.64286], headers ),
            RowItemModel( [-6, 0.16667, 7.142857], headers ),
        ]
    recommendedData = self.recommender(query)
    self.assertEqualRecommendedData(expectedData, recommendedData, query)

    # Change to Serial Bayes aggregation method
    query.aggregationMethod = "SerialBayes"
    expectedData = \
        [   RowItemModel( [-4, 0.89157, 66.867471], headers ),
            RowItemModel( [-6, 0.16667, 7.142857], headers ),
        ]
    recommendedData = self.recommender(query)
    self.assertEqualRecommendedData(expectedData, recommendedData, query)

    # Naive Bayes aggregation
    query.aggregationMethod = "NaiveBayes"
    expectedData = \
        [   RowItemModel( [-4, 3.75, 281.25], headers ),  # Without truncating negative values
            #RowItemModel( [-4, 0.8, 58.59707], headers ),  # With truncating negative values
            RowItemModel( [-6, 0.16667, 7.142857], headers ),
        ]
    recommendedData = self.recommender(query)
    self.assertEqualRecommendedData(expectedData, recommendedData, query)

    # Apply value filter
    query.fieldFilters["freqRatio>"] = 10.0
    expectedData = \
        [   RowItemModel( [-6, 0.16667, 7.142857], headers ),
        ]
    recommendedData = self.recommender(query)
    self.assertEqualRecommendedData(expectedData, recommendedData, query)
def patientItemFromSourceItem(self, sourceItem, clinicalItem, conn):
    """Build and persist the patient_item row for sourceItem under clinicalItem.

    Returns the RowItemModel with "patient_item_id" filled in, whether the row
    was newly inserted or already existed (duplicate unique key).
    """
    # some prov_map_id values are NULL in starr_datalake2018
    provMapId = sourceItem["prov_map_id"]
    if provMapId is None:
        externalId = None
    else:
        # prov_map_id starts with letters, we're interested only in number parts
        externalId = int(re.sub("[A-Z]+(\\d+)", "\\1", provMapId), 16)

    # Produce a patient_item record model for the given sourceItem
    patientItem = RowItemModel({
        "external_id": externalId,
        "patient_id": int(sourceItem["rit_uid"][2:], 16),
        "encounter_id": sourceItem["pat_enc_csn_id_coded"],
        "clinical_item_id": clinicalItem["clinical_item_id"],
        # str() on both timestamps so the time is not being converted in postgres
        "item_date": str(sourceItem["trtmnt_tm_begin_dt_jittered"]),
        "item_date_utc": str(sourceItem["trtmnt_tm_begin_dt_jittered_utc"]),
    })

    columnNames = list(patientItem.keys())
    columnValues = list(patientItem.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(DBUtil.buildInsertQuery("patient_item", columnNames),
                       columnValues, conn=conn)
        # Retrieve id of just inserted row
        patientItem["patient_item_id"] = DBUtil.execute(
            DBUtil.identityQuery("patient_item"), conn=conn)[0][0]
    except conn.IntegrityError as err:
        # Duplicate row: note it, then look up the existing ID by the composite
        # key components to avoid attempting duplicate insertion again
        log.info(err)
        searchPatientItem = {
            "patient_id": patientItem["patient_id"],
            "clinical_item_id": patientItem["clinical_item_id"],
            "item_date": patientItem["item_date"],
        }
        (patientItem["patient_item_id"], isNew) = DBUtil.findOrInsertItem(
            "patient_item", searchPatientItem, conn=conn)
    return patientItem
def test_calibrationAnalysis(self):
    """Simulate a command-line run of CalibrationPlot with 4 bins over a small
    outcome/score file and verify the Hosmer-Lemeshow P-value plus per-bin
    calibration statistics."""
    # Simulate command_line
    inputFileStr = \
"""# Comment header to ignore
outcome\tscore
0\t0.01
0\t0.02
0\t0.03
0\t0.04
0\t0.05
0\t0.10
0\t0.11
1\t0.12
1\t0.13
0\t0.15
0\t0.21
0\t0.22
1\t0.22
0\t0.23
1\t0.23
1\t0.31
1\t0.32
1\t0.33
1\t0.33
1\t0.34
1\t0.35
"""
    expectedHLP = 0.000218
    colNames = [
        "scoreMin", "scoreMax", "totalInstances", "observedOutcomes",
        "predictedOutcomes", "observedRate", "predictedRate"
    ]
    expectedResults = \
        [
            RowItemModel([0.01, 0.05, 5, 0, 0.15, 0.0, 0.030], colNames ),
            RowItemModel([0.10, 0.15, 5, 2, 0.61, 0.4, 0.122], colNames ),
            RowItemModel([0.21, 0.23, 5, 2, 1.11, 0.4, 0.222], colNames ),
            RowItemModel([0.31, 0.35, 6, 6, 1.98, 1.0, 0.330], colNames ),
        ]

    sys.stdin = StringIO(inputFileStr)  # Simulate stdin input
    sys.stdout = StringIO()  # Redirect stdout output to collect test results
    argv = ["CalibrationPlot.py", "-b", "4", "-", "-"]
    self.analyzer.main(argv)
    textOutput = StringIO(sys.stdout.getvalue())
    jsonData = self.extractJSONComment(textOutput)
    self.assertAlmostEqual(expectedHLP, jsonData["P-HosmerLemeshow"], 5)
    self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames)
def patientItemFromSourceItem(self, sourceItem, clinicalItem, conn):
    """Produce the patient_item record model for the given sourceItem under
    clinicalItem, inserting it if not already present — checked first via the
    in-memory self.patient_items key-hash cache, then via the database unique
    constraint. Returns the model with "patient_item_id" populated."""
    # Produce a patient_item record model for the given sourceItem
    patientItem = RowItemModel({
        "external_id": sourceItem["order_proc_id_coded"],
        "patient_id": int(sourceItem["jc_uid"][2:], 16),
        "encounter_id": sourceItem["pat_enc_csn_id_coded"],
        "clinical_item_id": clinicalItem["clinical_item_id"],
        "item_date": sourceItem["order_time_jittered"],
        "item_date_utc": str(sourceItem["order_time_jittered_utc"]),  # without str(), the time is being converted in postgres
    })

    # Composite-key hash lets us skip items already processed during this run
    key_hash = hash('{}{}{}'.format(patientItem["patient_id"],
                                    patientItem["clinical_item_id"],
                                    patientItem["item_date"]))
    if key_hash in self.patient_items:
        return self.patient_items[key_hash]

    insertQuery = DBUtil.buildInsertQuery("patient_item", list(patientItem.keys()))
    insertParams = list(patientItem.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
        patientItem["patient_item_id"] = DBUtil.execute(
            DBUtil.identityQuery("patient_item"), conn=conn)[0][0]
        self.patient_items[key_hash] = patientItem
    except conn.IntegrityError as err:
        # If turns out to be a duplicate, okay, pull out existing ID and continue to insert whatever else is possible
        log.warn(
            err
        )
        # Lookup just by the composite key components to avoid attempting duplicate insertion again
        searchPatientItem = {
            "patient_id": patientItem["patient_id"],
            "clinical_item_id": patientItem["clinical_item_id"],
            "item_date": patientItem["item_date"],
        }
        (patientItem["patient_item_id"],
         isNew) = DBUtil.findOrInsertItem("patient_item",
                                          searchPatientItem,
                                          conn=conn)
        self.patient_items[key_hash] = patientItem
    return patientItem
def clinicalItemFromSourceItem(self, sourceItem, category, conn):
    """Load or produce a clinical_item record model for the given sourceItem.

    A unique clinical_item is identified by (clinical_item_category_id,
    proc_code). Some debate about whether to distinguish by proc_id or
    proc_code, but there are many labs and other procs that use different
    proc_ids even though they are obviously the same (see STRIDE_ORDER_PROC
    examples like LABA1C).

    NOTE(review): self.clinicalItemByCategoryIdExtId starts empty each run, so
    in theory this conversion should only be run once per database — a second
    run would not know about the clinical_items already persisted. Ideally the
    cache would be primed from the database at startup; TODO confirm.
    """
    itemKey = (category["clinical_item_category_id"], sourceItem["proc_code"])
    if itemKey not in self.clinicalItemByCategoryIdExtId:
        # Not in the local cache: find in the database or persist a new record
        newItem = RowItemModel({
            "clinical_item_category_id": category["clinical_item_category_id"],
            "external_id": sourceItem["proc_id"],
            "name": sourceItem["proc_code"],
            "description": sourceItem["description"],
        })
        (itemId, isNew) = DBUtil.findOrInsertItem("clinical_item", newItem, conn=conn)
        newItem["clinical_item_id"] = itemId
        self.clinicalItemByCategoryIdExtId[itemKey] = newItem
    return self.clinicalItemByCategoryIdExtId[itemKey]
def assertEqualStatResultsTextOutput(self, expectedResults, textOutput, colNames):
    """Parse tab-delimited program text output into row models and compare the
    columns of interest against the expectedResults via assertEqualStatResults.

    Leading comment lines are skipped; NULL_STRING cells become None and
    numeric cells are converted to float (non-numeric cells kept verbatim).
    """
    # Skip leading comment lines; the first non-comment line is the header row
    line = textOutput.readline()
    while line.startswith(COMMENT_TAG):
        line = textOutput.readline()
    headers = line.strip().split("\t")

    parsedResults = list()
    for dataLine in textOutput:
        rowModel = RowItemModel(dataLine.strip().split("\t"), headers)
        # Convert the target elements of interest into numerical values
        for col in colNames:
            rawValue = rowModel[col]
            if rawValue == NULL_STRING:
                rowModel[col] = None
            else:
                try:
                    rowModel[col] = float(rawValue)
                except ValueError:
                    pass  # Not a number, just leave it as original value then
        parsedResults.append(rowModel)

    self.assertEqualStatResults(expectedResults, parsedResults, colNames)
def patientItemCollectionLinkFromSourceItem(self, sourceItem, collectionItem, patientItem, conn):
    """Produce and insert a patient_item_collection_link record tying the given
    patient_item to the given item_collection_item.

    A duplicate-key IntegrityError is logged and ignored so the rest of the
    batch can continue. Returns None.
    """
    # Produce a patient_item_collection_link record model for the given sourceItem
    patientItemCollectionLink = RowItemModel({
        "patient_item_id": patientItem["patient_item_id"],
        "item_collection_item_id": collectionItem["item_collection_item_id"],
    })
    insertQuery = DBUtil.buildInsertQuery("patient_item_collection_link",
                                          list(patientItemCollectionLink.keys()))
    insertParams = list(patientItemCollectionLink.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
    except conn.IntegrityError as err:  # was Python-2-only "except ..., err" syntax
        # If turns out to be a duplicate, okay, just note it and continue to insert whatever else is possible
        log.info(err)
def test_numRecsByOrderSet(self):
    """Designate number of recommendations indirectly via linked order set id:
    prepare patient items, inject a mock order_set_id column, then run the
    classification analysis with --numRecsByOrderSet and verify stats."""
    # Designate number of recommendations indirectly via linked order set id
    DBUtil.execute("update clinical_item set default_recommend = 0 where clinical_item_id = -8");  # Disable default recommend on one item to shift results

    colNames = ["patient_id", "TP", "FN", "FP", "recall", "precision", "F1-score", "weightRecall","weightPrecision", "ROC-AUC"];
    expectedResults = [ RowItemModel([-11111, 2, 0, 3, 1.0, 0.4, 0.571, 1.0, 0.3178, 0.4167], colNames ) ];

    # Do through fabricated prepared file intermediary
    sys.stdout = StringIO();
    argv = ["PreparePatientItems.py","-q","2","-v","3",'0,-11111',"-"];
    self.preparer.main(argv);
    preparedDataFile = StringIO(sys.stdout.getvalue());

    # Artificially add a key order set ID for the fabricated data
    modFile = StringIO();
    formatter = TextResultsFormatter(modFile);
    dataCols = None;
    for i, dataRow in enumerate(TabDictReader(preparedDataFile)):
        dataRow["order_set_id"] = TEST_ORDERSET_ID;
        if i <= 0:
            dataCols = list(dataRow.keys());
            formatter.formatTuple(dataCols);  # Insert a mock record to get a header / label row
        formatter.formatResultDict(dataRow, dataCols);
    preparedDataFile = StringIO(modFile.getvalue());

    sys.stdin = preparedDataFile;  # Read prepared data file from redirected stdin
    sys.stdout = StringIO();
    #argv = ["RecommendationClassificationAnalysis.py","-P","-r","5","-m","0","-R","ItemAssociationRecommender",'-',"-"];
    argv = ["RecommendationClassificationAnalysis.py","-P","--numRecsByOrderSet","-m","0","-R","ItemAssociationRecommender",'-',"-"];
    self.analyzer.main(argv);
    textOutput = StringIO(sys.stdout.getvalue());
    self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames);
def test_tripleSequence_virtualItem(self):
    """Test outcome assessment when the target is a virtual item based on the
    presence of a triple (instead of double) sequence of items.
    Runs the analyzer directly, then again through its command-line interface,
    verifying the same expected stats both times."""
    # Run the recommender against the mock test data above and verify expected stats afterwards.
    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = set([-22222])
    analysisQuery.baseCategoryId = -7
    analysisQuery.queryTimeSpan = timedelta(0, 86400)
    analysisQuery.sequenceItemIdsByVirtualItemId[-16] = (-15, -14)  # Virtual item -16 defined by the (-15, -14) item sequence
    #analysisQuery.recommender = BaselineFrequencyRecommender();
    analysisQuery.recommender = ItemAssociationRecommender()
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.targetItemIds = set([-16])
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data

    # Initial run without time limits on outcome measure
    colNames = ["patient_id", "outcome.-16", "score.-16"]
    expectedResults = [RowItemModel([-22222, +1, 0.14286], colNames)]
    analysisResults = self.analyzer(analysisQuery)
    self.assertEqualStatResults(expectedResults, analysisResults, colNames)

    # Redo but run through command-line interface
    sys.stdout = StringIO()  # Redirect stdout output to collect test results
    argv = [
        "OutcomePredictionAnalysis.py", "-c", "-7", "-Q", "86400", "-o",
        "-16=-15:-14", "-m", "0", "-R", "ItemAssociationRecommender",
        '0,-22222', "-"
    ]
    self.analyzer.main(argv)
    textOutput = StringIO(sys.stdout.getvalue())
    self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames)
def itemCollectionFromSourceItem(self, sourceItem, conn):
    """Load or produce an item_collection record model for the given sourceItem.

    Returns None when the sourceItem has no order set (protocol) link;
    otherwise returns the cached or newly persisted collection model with
    "item_collection_id" populated."""
    if sourceItem["protocol_id"] is None:
        # No order set link to this item, so nothing to return
        return None

    # Composite key components identifying one collection (section/subgroup within a protocol)
    key = {
        "protocol_id": sourceItem["protocol_id"],
        "ss_section_id": sourceItem["ss_section_id"],
        "ss_sg_key": sourceItem["ss_sg_key"].strip().upper() if sourceItem["ss_sg_key"] is not None else None
    }
    collection_key = "%(protocol_id)d-%(ss_section_id)s-%(ss_sg_key)s" % key
    if collection_key not in self.itemCollectionByKeyStr:
        # Collection does not yet exist in the local cache. Check if in database table (if not, persist a new record)
        collection = RowItemModel({
            "external_id": sourceItem["protocol_id"],
            "name": sourceItem["protocol_name"],
            "section": sourceItem["ss_section_name"],
            "subgroup": sourceItem["ss_sg_name"],
        })
        (collectionId, isNew) = DBUtil.findOrInsertItem("item_collection", collection, conn=conn)
        collection["item_collection_id"] = collectionId
        self.itemCollectionByKeyStr[collection_key] = collection
    return self.itemCollectionByKeyStr[collection_key]
def test_documentParse(self):
    """Run the data conversion on the test data and look for expected answers."""
    # Run the data conversion on the test data and look for expected answers
    headers = [
        "note_id", "pat_mrn_id", "contact_date", "FollowupPhone", "TeamPager",
        "DietOrders", "PrimaryDx", "FollowupSchedule", "NewRx", "StopRx",
        "AddnInstr"
    ]
    expectedDataTable = \
        [
            ["500001", "100001", "01-JUL-14", 1, 1, 0, 1, 1, 0, 0, 7],
            ["500002", "100002", "05-AUG-14", 0, 1, 3, 0, 0, 0, 1, 7],
            ["500003", "100003", "03-SEP-13", 3, 0, 3, 0, 1, 6, 1, 54],
        ]
    expectedData = list()
    for row in expectedDataTable:
        expectedData.append(RowItemModel(row, headers))

    sourceFile = StringIO(self.testFileStr)
    outputFile = StringIO()  # Save but discard this, won't inspect detailed contents
    actualData = self.parser(sourceFile, outputFile)
    """
    for summaryRecord in actualData:
        for header in headers:
            print >> sys.stderr, summaryRecord[header],
        print >> sys.stderr;
    print >> sys.stderr, summaryRecord["docModel"]["StopRx"]
    """
    self.assertEqualDictList(expectedData, actualData, headers)
def assertEqualTextOutput(self, expectedResults, textOutput, colNames):
    """Convenience function to verify text output from a program to match the
    provided symbolic expected results, by parsing the text into structured
    fields.

    Columns in colNames are converted to int; a column absent from the output
    is looked up as its "<col>JSON" counterpart and decoded via loadJSONDict.
    """
    # Skip any leading comment lines; first non-comment line holds the column headers
    headerLine = textOutput.readline()
    while headerLine.startswith(COMMENT_TAG):
        headerLine = textOutput.readline()
    headers = headerLine.strip().split("\t")

    parsedResults = list()
    for dataLine in textOutput:
        rowModel = RowItemModel(dataLine.strip().split("\t"), headers)
        # Convert the target elements of interest into numerical values
        for col in colNames:
            if col in rowModel:
                rowModel[col] = int(rowModel[col])
            else:
                # Look for a JSON encoded version
                jsonCol = "%sJSON" % col
                if jsonCol in rowModel:
                    rowModel[col] = loadJSONDict(rowModel[jsonCol], int, int)
        parsedResults.append(rowModel)

    #for col in colNames:
    #    print >> sys.stderr, col, expectedResults[0][col], analysisResults[0][col]
    self.assertEqualStatResults(expectedResults, parsedResults, colNames)
def querySourceItems(self, convOptions, progress=None, conn=None):
    """Query the database for the list of all source clinical items
    (culture results, etc.) and yield the results one at a time as
    RowItemModels, to avoid keeping the whole result set in memory.

    Rows without a shifted_result_time are skipped. Positive cultures
    (organism_name present) with uninterpretable results (no susceptibility,
    no antibiotic, or antibiotic 'Method') are skipped as well.

    If conn is provided, the caller owns it; otherwise a connection is
    created and closed here.
    """
    extConn = conn is not None
    if not extConn:
        conn = self.connFactory.connection()

    # Column headers to query for that map to respective fields in analysis table
    headers = ["order_proc_anon_id", "pat_anon_id", "pat_enc_csn_anon_id", "proc_code", "organism_name", "antibiotic_name", "suseptibility", "shifted_result_time"]

    query = SQLQuery()
    for header in headers:
        query.addSelect(header)
    query.addFrom("stride_culture_micro")

    # TODO: FIGURE OUT WHY CAN'T DO >= OPERATION HERE
    # (date-range filtering on shifted_result_time via
    #  convOptions.startDate / convOptions.endDate was intended but disabled)

    # Query to get an estimate of how long the process will be
    if progress is not None:
        progress.total = DBUtil.execute(query.totalQuery(), conn=conn)[0][0]

    cursor = conn.cursor()
    # Do one massive query, but yield data for one item at a time
    cursor.execute(str(query), tuple(query.params))

    row = cursor.fetchone()
    while row is not None:
        rowModel = RowItemModel(row, headers)

        if rowModel['shifted_result_time'] is None:
            # Don't add if no result time given
            row = cursor.fetchone()
            continue

        if rowModel['organism_name'] is not None:
            # If positive culture but results uninterpretable, don't add feature
            if rowModel['suseptibility'] is None or rowModel['antibiotic_name'] == 'Method' or rowModel['antibiotic_name'] is None:
                row = cursor.fetchone()
                continue

        # Sanitize '/' so we don't run into directory issues later when writing temp files.
        # Explicit None guard replaces the former bare "except: pass", which
        # silently masked every error, not just the intended missing-name case.
        if rowModel['antibiotic_name'] is not None:
            rowModel['antibiotic_name'] = rowModel['antibiotic_name'].replace('/', '-')

        yield rowModel  # Yield one row worth of data at a time to avoid having to keep the whole result set in memory
        row = cursor.fetchone()

    # Slight risk here. Normally DB connection closing should be in finally of a try block,
    # but using the "yield" generator construct forbids us from using a try, finally construct.
    cursor.close()
    if not extConn:
        conn.close()
def accessLogFromSourceItem(self, sourceItem, user, metric, metricLine, conn):
    """Insert an access_log record for the given sourceItem with links to the
    looked-up user and metric records. Multi-line descriptions should be
    recorded only once, so only act when metricLine == 1; otherwise return None.
    """
    logRecord = None
    if metricLine == 1:
        logRecord = RowItemModel(
            {
                "user_id": user["user_id"],
                "de_pat_id": sourceItem["de_pat_id"],
                "metric_id": metric["metric_id"],
                "access_datetime": sourceItem["access_datetime"],
            }
        )
        query = DBUtil.buildInsertQuery("access_log", logRecord.keys())
        params = logRecord.values()
        DBUtil.execute(query, params, conn=conn)
    return logRecord
def querySourceItems(self, startDate=None, endDate=None, progress=None, conn=None):
    """Query the database for list of all source clinical items
    (lab results in this case) and yield the results one at a time.
    If startDate provided, only return items whose result_time is on or after that date.

    Only include results records where the result_flag is set to an informative value,
    to focus only on abnormal lab results (including would be a ton more relatively
    uninformative data that would greatly expend data space and subsequent
    computation time).

    If conn is provided, the caller owns the connection; otherwise one is
    created here and closed when the generator is exhausted.
    """
    extConn = conn is not None;
    if not extConn:
        conn = self.connFactory.connection();

    # Column headers to query for that map to respective fields in analysis table.
    # order_proc_id and result_time are qualified with the "sor." alias since
    # they appear in both joined tables.
    headers = ["sor.order_proc_id", "pat_id", "pat_enc_csn_id", "order_type", "proc_id", "proc_code", "base_name", "component_name", "common_name", "ord_num_value", "result_flag", "result_in_range_yn", "sor.result_time"];

    query = SQLQuery();
    for header in headers:
        query.addSelect( header );
    query.addFrom("stride_order_proc as sop");
    query.addFrom("%s as sor" % SOURCE_TABLE);
    query.addWhere("sop.order_proc_id = sor.order_proc_id");
    #query.addWhere("result_flag <> '*'");   # Will exclude nulls and the uninformative '*' values for text-based microbiology results
    if startDate is not None:
        query.addWhereOp("sor.result_time",">=", startDate);
    if endDate is not None:
        query.addWhereOp("sor.result_time","<", endDate);

    # Query to get an estimate of how long the process will be
    if progress is not None:
        progress.total = DBUtil.execute(query.totalQuery(), conn=conn)[0][0];

    cursor = conn.cursor();
    # Do one massive query, but yield data for one item at a time.
    cursor.execute( str(query), tuple(query.params) );

    row = cursor.fetchone();
    while row is not None:
        rowModel = RowItemModel( row, headers );

        # Normalize qualified labels back to their plain column names
        rowModel["order_proc_id"] = rowModel["sor.order_proc_id"];
        rowModel["result_time"] = rowModel["sor.result_time"];

        # Skip rows with no base_name to key the result by
        if rowModel['base_name'] is None:
            row = cursor.fetchone()
            continue

        self.populateResultFlag(rowModel,conn=conn);

        yield rowModel; # Yield one row worth of data at a time to avoid having to keep the whole result set in memory
        row = cursor.fetchone();

    # Slight risk here. Normally DB connection closing should be in finally of a try block,
    # but using the "yield" generator construct forbids us from using a try, finally construct.
    cursor.close();
    if not extConn:
        conn.close();
def test_recommender_stats(self):
    """Run the recommender against the mock test data above and verify
    expected stats calculations."""
    query = RecommenderQuery()
    query.parseParams({
        "countPrefix": "patient_",
        "queryItemIds": "-6",
        "resultCount": "3",        # Just get top 3 ranks for simplicity
        "maxRecommendedId": "0",   # Artificial constraint to focus only on test data
        "sortField": "P-Fisher",   # Specifically request derived expected vs. observed stats
    })

    log.debug("Query with single item not perturbed by others.")
    statCols = [
        "clinical_item_id", "N", "nB", "nA", "nAB",
        "conditionalFreq", "baselineFreq", "freqRatio", "P-Fisher"
    ]
    expectedData = [
        RowItemModel([-2, SIMULATED_PATIENT_COUNT, 30.0, 70.0, 7.0, 0.1, 0.0100, 10.0, 3.7e-06], statCols),
        RowItemModel([-4, SIMULATED_PATIENT_COUNT, 40.0, 70.0, 20.0, 0.286, 0.0133, 21.42857, 1.2e-23], statCols),
    ]
    actualData = self.recommender(query)
    self.assertEqualRecommendedDataStats(expectedData, actualData, statCols)

    log.debug("Query for non-unique counts.")
    query.parseParams({
        "countPrefix": "",
        "sortField": "oddsRatio",
    })
    statCols = [
        "clinical_item_id", "N", "nB", "nA", "nAB",
        "conditionalFreq", "baselineFreq", "freqRatio", "oddsRatio"
    ]
    expectedData = [
        RowItemModel([-4, SIMULATED_PATIENT_COUNT, 40.0, 70.0, 25.0, 0.35714, 0.01333, 26.7857, 107.96296], statCols),
        RowItemModel([-2, SIMULATED_PATIENT_COUNT, 30.0, 70.0, 12.0, 0.1714, 0.01, 17.1429, 33.47126], statCols),
    ]
    actualData = self.recommender(query)
    self.assertEqualRecommendedDataStats(expectedData, actualData, statCols)
def patientItemModelFromSourceItem(self, sourceItem, clinicalItem, conn):
    """Build and insert a patient_item record for the given sourceItem,
    linked to the given clinical_item lookup record.

    Duplicate inserts (IntegrityError from a unique constraint) are tolerated:
    the error is logged and processing continues so whatever else is possible
    can still be inserted.
    """
    patientItem = RowItemModel(
        {
            "external_id": None,
            # rit_uid carries a 2-character prefix; the remainder is a hex patient id
            "patient_id": int(sourceItem["rit_uid"][2:], 16),
            "encounter_id": None,
            "clinical_item_id": clinicalItem["clinical_item_id"],
            "item_date": sourceItem["itemDate"],
        }
    )
    # Materialize dict views as lists, consistent with the sibling
    # patientItemCollectionLinkFromSourceItem builder.
    insertQuery = DBUtil.buildInsertQuery("patient_item", list(patientItem.keys()))
    insertParams = list(patientItem.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
    except conn.IntegrityError as err:
        # "except E as err" (Python 3); the former "except E, err" form is a
        # SyntaxError on Python 3. If it turns out to be a duplicate, okay,
        # just note it and continue.
        log.info(err)
def main(self, argv):
    """Main method, callable from command line.

    Trains a prediction model from <trainFile>, scores the cases in
    <testFile>, and writes tab-delimited results suitable for ROC analysis.
    """
    usageStr = "usage: %prog [options] <trainFile> <testFile> [<outputFile>]\n"+\
        " <trainFile> Tab-delimited file, queryItemIdsJSON expected to be parseable into lists of query items as well as an outcome.X column\n"+\
        " <testFile> Same structure as trainFile, but with test cases to assess prediction scoring\n"+\
        " <outputFile> Tab-delimited that can be used for ROC analysis with columns for outcome and predicted score\n"+\
        ""
    parser = OptionParser(usage=usageStr)
    parser.add_option( "-o", "--outcomeItemId", dest="outcomeItemId", help="Outcome item IDs to assess get prediction scores for")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    # NOTE(review): the usage string marks <outputFile> as optional, but this
    # guard requires all three positional args (len(args) > 2) before any
    # analysis runs -- two args fall through to print_help/exit, and the inner
    # "if len(args) > 2" for outputFilename is then always true. Confirm
    # whether this should be len(args) >= 2 (depends on stdOpen(None) behavior).
    if len(args) > 2:
        trainFile = stdOpen(args[0])
        testFile = stdOpen(args[1])
        outcomeId = int(options.outcomeItemId)

        # Run the actual analysis
        (featureMatrix, outcomeMatrix, queryIds, rowModels) = self.fileToMatrixes(trainFile, outcomeId)
        model = self.train(featureMatrix, outcomeMatrix)
        analysisResults = self.predict(testFile, model, queryIds, outcomeId)

        # Format the results for output
        outputFilename = None
        if len(args) > 2:
            outputFilename = args[2]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with arguments to allow for deconstruction later as well as extra results
        print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

        colNames = self.analysisHeaders(outcomeId)
        analysisResults.insert(0, RowItemModel(colNames, colNames))  # Insert a mock record to get a header / label row

        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def test_orderSetAnalysisPreparer(self):
    """Run the analysis preparer through the command-line interface against
    the mock test data and verify expected data afterwards, using existing
    order set usage as reference points."""
    sys.stdout = StringIO()  # Redirect stdout output to collect test results
    argv = [
        "PreparePatientItems.py", "-O", "-c", "-7", "-Q", "86400", "-V",
        "3600", '0,-55555,-11111', "-"
    ]
    self.analyzer.main(argv)
    capturedOutput = StringIO(sys.stdout.getvalue())

    colNames = [
        "patient_id", "baseItemId", "queryItemCountById",
        "verifyItemCountById", "order_set_id"
    ]
    expectedResults = [
        # Diagnosis not in verify items
        RowItemModel([-55555, -25, {-25: 1, -2: 1}, {-1: 1, -9: 1, -5: 1, -8: 1, -11: 1, -12: 1}, -1], colNames),
        # Diagnosis okay to be in query items
        RowItemModel([-55555, -25, {-25: 1, -2: 1, -1: 1, -9: 1, -5: 1}, {-8: 1, -11: 1, -12: 1}, -2], colNames),
    ]
    self.assertEqualTextOutput(expectedResults, capturedOutput, colNames)
def queryPatients(outputFile):
    """Select the cohort of patients with any ferritin lab result, annotate
    each with surgery and dialysis indicator flags based on prior clinical
    item orders, and dump the cohort as tab-delimited text to outputFile.

    Returns a dict mapping patient_id -> RowItemModel for the cohort.
    """
    log.info("Select patients with any result for a ferritin test")
    patientById = dict()
    query = \
        """select distinct pat_id
        from stride_order_results as sor, stride_order_proc as sop
        where sor.order_proc_id = sop.order_proc_id
        and base_name = 'ferritin'
        """
    results = DBUtil.execute(query)
    for (patientId,) in results:
        patientId = int(patientId)
        patientById[patientId] = RowItemModel({"patient_id": patientId})

    # Not perfectly accurate for isolating surgical patients
    log.info("Patients with admit or diet orders for surgery")
    # dict.values() here and below: itervalues() was Python 2 only and raises
    # AttributeError on Python 3 (which this file otherwise targets).
    for patient in patientById.values():
        patient["surgery"] = 0  # Default to 0 / false
    query = \
        """select distinct patient_id
        from patient_item
        where clinical_item_id in (3614,4177,4220)
        """
    results = DBUtil.execute(query)
    for (patientId,) in results:
        if patientId in patientById:
            patientById[patientId]["surgery"] = 1

    # Does not differentiate acute vs. chronic. Includes peritoneal.
    log.info("Patients with an order for dialysis")
    for patient in patientById.values():
        patient["dialysis"] = 0  # Default to 0 / false
    query = \
        """select distinct patient_id
        from patient_item
        where clinical_item_id in (1815,3783,4322)
        """
    results = DBUtil.execute(query)
    for (patientId,) in results:
        if patientId in patientById:
            patientById[patientId]["dialysis"] = 1

    # Drop results as tab-delimited text output
    formatter = TextResultsFormatter(outputFile)
    formatter.formatResultDicts(patientById.values(), addHeaderRow=True)

    return patientById
def test_copyPatientTemplate(self):
    """Copy a patient template, including deep copy of notes, orders, and
    states, and verify only data up to relative time zero was carried over."""
    newPatientData = {"name": "Template Copy"}
    templatePatientId = -1
    self.testPatientId = self.manager.copyPatientTemplate(newPatientData, templatePatientId)
    futureTime = 1000000  # Far future time to test that we still only copied the results up to time zero

    # Verify basic patient information
    patientCols = ["name", "age_years", "gender", "sim_state_id"]
    actualPatient = self.manager.loadPatientInfo([self.testPatientId])[0]
    expectedPatient = RowItemModel(["Template Copy", 60, "Female", -1], patientCols)
    self.assertEqualDict(expectedPatient, actualPatient, patientCols)

    # Verify notes
    noteCols = ["sim_patient_id", "content"]
    actualNotes = self.manager.loadNotes(self.testPatientId, futureTime)
    expectedNotes = [
        RowItemModel([self.testPatientId, "Initial Note"], noteCols),
        # Second copy because another state initiation at time zero and negative onset time
        RowItemModel([self.testPatientId, "Initial Note"], noteCols),
    ]
    self.assertEqualDictList(expectedNotes, actualNotes, noteCols)

    # Verify orders
    orderCols = [
        "sim_user_id", "sim_patient_id", "sim_state_id", "clinical_item_id",
        "relative_time_start", "relative_time_end"
    ]
    actualOrders = self.manager.loadPatientOrders(self.testPatientId, futureTime, loadActive=None)
    expectedOrders = [
        RowItemModel([-1, self.testPatientId, -1, -15, 0, None], orderCols),
    ]
    self.assertEqualDictList(expectedOrders, actualOrders, orderCols)

    # Verify states, queried directly from the sim_patient_state table
    stateCols = [
        "sim_patient_id", "sim_state_id", "relative_time_start",
        "relative_time_end"
    ]
    query = SQLQuery()
    for col in stateCols:
        query.addSelect(col)
    query.addFrom("sim_patient_state")
    query.addWhereEqual("sim_patient_id", self.testPatientId)
    query.addOrderBy("relative_time_start")
    resultTable = DBUtil.execute(query, includeColumnNames=True)
    actualStates = modelListFromTable(resultTable)
    expectedStates = [
        RowItemModel([self.testPatientId, -1, -7200, 0], stateCols),
        RowItemModel([self.testPatientId, -1, 0, None], stateCols),
    ]
    self.assertEqualDictList(expectedStates, actualStates, stateCols)