def __call__(self, query):
    # Given query items, look up existing order sets to find and score related items
    # Load item lookup information
    if self.itemsById is None:
        self.initItemLookups(query)

    # Adapt query into dictionary format
    queryItemCountById = query.queryItemIds
    if not isinstance(queryItemCountById, dict):
        # Not a dictionary, probably a one-dimensional list/set, so just add counts of 1
        itemIds = queryItemCountById
        queryItemCountById = dict()
        for itemId in itemIds:
            queryItemCountById[itemId] = 1

    # Primary execution. Apply query to generate a scored relationship to each order set.
    weightByOrderSetId = self.estimateOrderSetWeights(queryItemCountById, self.itemIdsByOrderSetId, self.orderSetIdsByItemId)

    # Composite scores for (recommendable) items by taking a weighted average across the top items for each order set
    recScoreByItemId = dict()
    for itemId in self.candidateItemIds:
        if self.isItemRecommendable(itemId, queryItemCountById, query, self.categoryIdByItemId):
            recScoreByItemId[itemId] = 0.0

    for orderSetId, orderSetWeight in weightByOrderSetId.iteritems():
        for itemId in recScoreByItemId.keys():
            itemWeight = self.itemOrderSetWeight(itemId, orderSetId, self.itemIdsByOrderSetId)
            recScoreByItemId[itemId] += orderSetWeight * itemWeight

    # Build item models into a list so they can be sorted by score
    recommendedData = list()
    numItemsInAnyOrderSet = len(self.orderSetIdsByItemId)
    for itemId, totalItemWeight in recScoreByItemId.iteritems():
        tfidf = 0.0
        if itemId in self.orderSetIdsByItemId:
            numOrderSetsWithItem = len(self.orderSetIdsByItemId[itemId])
            # Scale TF*IDF score based on baseline order set counts to prioritize disproportionately common items
            tfidf = totalItemWeight * numItemsInAnyOrderSet / numOrderSetsWithItem

        itemModel = \
            {
                "totalItemWeight": totalItemWeight, "tf": totalItemWeight, "PPV": totalItemWeight, "P(item|query)": totalItemWeight, "P(B|A)": totalItemWeight,
                "tfidf": tfidf, "lift": tfidf, "interest": tfidf, "P(item|query)/P(item)": tfidf, "P(B|A)/P(B)": tfidf,
                "clinical_item_id": itemId,
                "weightByOrderSetId": weightByOrderSetId,  # Duplicate for each item, but persist here to enable retrieval by the caller
                "numSelectedOrderSets": len(weightByOrderSetId),
            }
        itemModel["score"] = itemModel[query.sortField]
        recommendedData.append(itemModel)

    recommendedData.sort(RowItemFieldComparator(["score", "clinical_item_id"]), reverse=True)
    return recommendedData
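# Illustrative standalone sketch (not part of the class above) of how order-set weighted
# scoring and the TF*IDF-style rescaling combine on toy data. The weighting formula here
# (fraction of query items appearing in each order set) is an assumption for demonstration;
# the real logic lives in estimateOrderSetWeights and itemOrderSetWeight.
def _sketchScoreItems(queryItemIds, itemIdsByOrderSetId):
    # Weight each order set by its overlap with the query set (assumed formula)
    weightByOrderSetId = {}
    for orderSetId, itemIds in itemIdsByOrderSetId.items():
        weightByOrderSetId[orderSetId] = float(len(queryItemIds & itemIds)) / len(queryItemIds)

    # Accumulate each item's score as a weighted sum over order sets (the "tf" analog)
    scoreByItemId = {}
    for orderSetId, weight in weightByOrderSetId.items():
        for itemId in itemIdsByOrderSetId[orderSetId]:
            scoreByItemId[itemId] = scoreByItemId.get(itemId, 0.0) + weight / len(itemIdsByOrderSetId[orderSetId])

    # Inverse order-set frequency rescaling, mirroring the method above:
    # tfidf = tf * numItemsInAnyOrderSet / numOrderSetsWithItem
    orderSetIdsByItemId = {}
    for orderSetId, itemIds in itemIdsByOrderSetId.items():
        for itemId in itemIds:
            orderSetIdsByItemId.setdefault(itemId, set()).add(orderSetId)
    numItemsInAnyOrderSet = len(orderSetIdsByItemId)
    tfidfByItemId = {}
    for itemId, tf in scoreByItemId.items():
        tfidfByItemId[itemId] = tf * numItemsInAnyOrderSet / len(orderSetIdsByItemId[itemId])
    return tfidfByItemId

# Example: item 3 appears in both toy order sets, so its idf-style rescaling is halved
#print(_sketchScoreItems({1, 2}, {"A": {1, 2, 3}, "B": {3, 4}}))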
def parseResidentScheduleItems(self, resident, resTextChunksList, dateRanges, splitDates):
    """Parse just the text chunks for an individual resident,
    being aware of the potential for a list of multiple rows.
    """
    provId = self.inferProvIdFromName(resident)
    for iBlock, dateRange in enumerate(dateRanges):
        # Iterate through each major rotation block
        splitDate = splitDates[iBlock]

        # First pass through to look for specifically dated rotations
        scheduleItems = list()
        for iRow, resTextChunks in enumerate(resTextChunksList):
            textChunk = resTextChunks[iBlock].strip()
            #print >> sys.stderr, iRow, textChunk
            if textChunk != "" and textChunk[-1].isdigit():
                # Ends with a number, so must be a date specification
                subChunks = textChunk.split()
                dateRangeText = subChunks.pop(-1)  # Separate out the date
                (startText, endText) = dateRangeText.split("-")  # Separate start from end date
                startDate = self.parseDateText(startText, dateRange[0], 0)
                endDate = self.parseDateText(endText, dateRange[0], 1)
                rotation = str.join(' ', subChunks)  # Reconstruct the rotation name
                scheduleItem = {"prov_id": provId, "name": resident, "rotation": rotation, "start_date": startDate, "end_date": endDate}
                scheduleItems.append(scheduleItem)

        # Second pass to look for rotations without dates specified, based on standard dates
        for iRow, resTextChunks in enumerate(resTextChunksList):
            textChunk = resTextChunks[iBlock].strip()
            if textChunk != "" and not textChunk[-1].isdigit():
                # Remaining non-blank items that do not end with a number
                subChunks = textChunk.split("|")  # See if split into multiple rotations
                if len(subChunks) > 1:
                    # Multiple rotations within the time block
                    rotation = subChunks[0].strip()
                    if rotation[0].isalpha():  # Ensure it starts with a letter, not a special character used as a blank placeholder
                        (startDate, endDate) = self.compressDateRange(dateRange[0], splitDate, scheduleItems)  # End on the split date
                        scheduleItem = {"prov_id": provId, "name": resident, "rotation": rotation, "start_date": startDate, "end_date": endDate}
                        scheduleItems.append(scheduleItem)
                    rotation = subChunks[-1].strip()
                    if rotation[0].isalpha():  # Ensure it starts with a letter, not a special character used as a blank placeholder
                        (startDate, endDate) = self.compressDateRange(splitDate, dateRange[-1], scheduleItems)  # Start on the split date
                        scheduleItem = {"prov_id": provId, "name": resident, "rotation": rotation, "start_date": startDate, "end_date": endDate}
                        scheduleItems.append(scheduleItem)
                else:
                    # Single rotation spanning the full block
                    rotation = subChunks[0].strip()
                    if rotation[0].isalpha():  # Ensure it starts with a letter, not a special character used as a blank placeholder
                        (startDate, endDate) = self.compressDateRange(dateRange[0], dateRange[-1], scheduleItems)
                        scheduleItem = {"prov_id": provId, "name": resident, "rotation": rotation, "start_date": startDate, "end_date": endDate}
                        scheduleItems.append(scheduleItem)

        # Now yield / generate the results, but keep them sorted in chronological order
        scheduleItems.sort(RowItemFieldComparator("start_date"))
        for item in scheduleItems:
            yield item
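# Illustrative sketch of the first-pass split logic above on a hypothetical dated chunk
# like "MICU 6/24-7/21". The fixed defaultYear is an assumption for demonstration; the
# real parseDateText infers year context from the enclosing rotation block's date range.
from datetime import datetime

def _sketchParseDatedChunk(textChunk, defaultYear=2014):
    subChunks = textChunk.split()
    dateRangeText = subChunks.pop(-1)              # e.g., "6/24-7/21"
    (startText, endText) = dateRangeText.split("-")
    startDate = datetime.strptime("%s/%d" % (startText, defaultYear), "%m/%d/%Y")
    endDate = datetime.strptime("%s/%d" % (endText, defaultYear), "%m/%d/%Y")
    rotation = str.join(' ', subChunks)            # Reconstruct the rotation name
    return (rotation, startDate, endDate)

#print(_sketchParseDatedChunk("MICU 6/24-7/21"))  # ('MICU', 2014-06-24, 2014-07-21)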
def __call__(self, inputFile1, inputFile2, options):
    # Parse out the files into score models for each row
    scoreModels1 = self.parseScoreModelsFromFile(inputFile1, scoreCols=[options.scoreCol1])
    scoreModels2 = self.parseScoreModelsFromFile(inputFile2, scoreCols=[options.scoreCol2])

    # Sort the results by the specified score column and sort order
    scoreModels1.sort(RowItemFieldComparator(options.scoreCol1, options.descSort1))
    scoreModels2.sort(RowItemFieldComparator(options.scoreCol2, options.descSort2))

    # Pull out the sorted list of key items for each
    itemList1 = columnFromModelList(scoreModels1, options.idCol1)
    itemList2 = columnFromModelList(scoreModels2, options.idCol2)

    # Calculate available ranked list similarity measures
    resultDict = dict()
    resultDict["RBO"] = self.calcRBO(itemList1, itemList2)
    self.populateQueryCounts(scoreModels1, scoreModels2, resultDict)
    return resultDict
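# Minimal sketch of rank-biased overlap (RBO; Webber et al. 2010), the measure calcRBO
# computes. This is the truncated form; the actual implementation may use the
# extrapolated estimate instead. The persistence parameter p controls how heavily the
# measure weights agreement at the top of the two ranked lists.
def _sketchRBO(itemList1, itemList2, p=0.9):
    k = min(len(itemList1), len(itemList2))
    seen1, seen2 = set(), set()
    score = 0.0
    for depth in range(1, k + 1):
        seen1.add(itemList1[depth - 1])
        seen2.add(itemList2[depth - 1])
        overlap = len(seen1 & seen2)
        score += (p ** (depth - 1)) * float(overlap) / depth  # Agreement at this depth
    return (1.0 - p) * score

#print(_sketchRBO(["a", "b", "c"], ["a", "c", "b"]))  # High overlap, slightly penalized ordering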
def __call__(self, inputFile, labelCols, valueCols, matchCols, baseLabels=None):
    prog = ProgressDots()
    self.labelCols = labelCols
    self.valueCols = valueCols
    self.matchCols = matchCols
    self.baseLabels = baseLabels

    labelModelByLabelKey = dict()
    dataByLabelKey = dict()

    reader = TabDictReader(inputFile)
    for rowModel in reader:
        labelKey = list()
        labelModel = dict()
        for labelCol in self.labelCols:
            labelModel[labelCol] = rowModel[labelCol]
            labelKey.append(rowModel[labelCol])
        labelKey = tuple(labelKey)  # Change to an immutable object that can be hashed

        # Copy just the items of interest
        valueModel = {}
        if self.matchCols:
            for matchCol in self.matchCols:
                valueModel[matchCol] = rowModel[matchCol]
        for valueCol in self.valueCols:
            try:
                valueModel[valueCol] = float(rowModel[valueCol])
            except ValueError:  # Maybe a "None" string that could not be parsed into a number
                valueModel[valueCol] = None

        if labelKey not in dataByLabelKey:
            labelModelByLabelKey[labelKey] = labelModel
            dataByLabelKey[labelKey] = list()
        dataByLabelKey[labelKey].append(valueModel)
        prog.update()
    # prog.printStatus();

    # Another pass to ensure data is consistently sorted within each group to allow later paired t-tests
    if self.matchCols:
        for labelKey, data in dataByLabelKey.iteritems():
            data.sort(RowItemFieldComparator(self.matchCols))

    # See if looking for only one set of base-labeled data to compare the rest against
    baseLabelKey = None
    if self.baseLabels is not None:
        baseLabelKey = tuple(self.baseLabels)

    # Results pass to compare all group pair-wise combinations
    prog = ProgressDots()
    for labelKey0, data0 in dataByLabelKey.iteritems():
        prefix0 = "Group0."
        labelModel0 = labelModelByLabelKey[labelKey0]
        if baseLabelKey is not None and labelKey0 != baseLabelKey:
            continue  # Skip entries where the base label does not match the specified key
        for labelKey1, data1 in dataByLabelKey.iteritems():
            prefix1 = "Group1."
            labelModel1 = labelModelByLabelKey[labelKey1]

            result = dict()
            for labelCol in self.labelCols:
                result[prefix0 + labelCol] = labelModel0[labelCol]
                result[prefix1 + labelCol] = labelModel1[labelCol]

            for valueCol in self.valueCols:
                # Pull out the value column for each data group. Previously sorted by match columns to allow paired t-testing.
                # Skip any value pairs with a non-numeric / None value
                values0 = list()
                values1 = list()
                for dataItem0, dataItem1 in zip(data0, data1):
                    if dataItem0[valueCol] is not None and dataItem1[valueCol] is not None:
                        values0.append(dataItem0[valueCol])
                        values1.append(dataItem1[valueCol])

                for summaryFunction in SUMMARY_FUNCTIONS:
                    result[prefix0 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values0)
                    result[prefix1 + valueCol + "." + summaryFunction.__name__] = summaryFunction(values1)

                for compTest in COMPARISON_TESTS:
                    (t, p) = compTest(values0, values1)
                    if np.isnan(p):
                        p = None  # Use a more generic expression for NaN / null values
                    result[compTest.__name__ + "." + valueCol] = p

            yield result
            prog.update()
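# Hedged sketch of what the module-level SUMMARY_FUNCTIONS and COMPARISON_TESTS could
# look like for the generator above; the actual definitions may differ. Each comparison
# test must return a (statistic, p-value) pair, matching the "(t, p) = compTest(...)"
# unpacking, which the scipy.stats two-sample tests satisfy.
import numpy as np
from scipy import stats

SUMMARY_FUNCTIONS_EXAMPLE = [len, np.mean, np.std, np.median]
COMPARISON_TESTS_EXAMPLE = \
    [
        stats.ttest_rel,  # Paired t-test; depends on the match-column sorting done above
        stats.ttest_ind,  # Independent samples t-test
        stats.wilcoxon,   # Non-parametric paired alternative
    ]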
def normalizeMixData(self, rxcuiDataByMedId, mixByOrderMedId, convOptions):
    """Look through the mixture components to compile a consolidated set of medication data"""
    for orderMedId, mixList in mixByOrderMedId.iteritems():
        mixSize = len(mixList)
        ingredientIds = set()
        ingredientList = list()
        for rowModel in mixList:
            #print >> sys.stderr, rowModel
            if mixSize == 2 and rowModel["ingredient_type"] == "Base":
                # Mixture of two ingredients where one is just a base (usually NS or D5W); ignore the base ingredient.
                # Misses the edge case where both ingredients are "bases," though that appears to represent only ~18 items out of ~421K med mixes
                pass
            else:
                # Pull out fully normalized component ingredients first
                subConvOptions = ConversionOptions()
                subConvOptions.normalizeMixtures = True
                subConvOptions.includeRouteInDescription = False
                for ingredientModel in self.normalizeMedIngredients(rxcuiDataByMedId, rowModel, subConvOptions):
                    medId = ingredientModel["medication_id"]
                    if medId not in ingredientIds:  # Avoid adding duplicates
                        ingredientList.append(ingredientModel)
                        ingredientIds.add(medId)

        ingredientCount = len(ingredientList)
        if ingredientCount <= 1 or convOptions.normalizeMixtures:
            # Single ingredient, or want component active ingredients to each have one record
            for ingredientModel in ingredientList:
                ingredientModel["description"] += " (%s)" % (rowModel["med_route"])
                if convOptions.doseCountLimit is not None and ingredientModel["number_of_doses"] is not None:
                    if ingredientModel["number_of_doses"] < convOptions.doseCountLimit:
                        ingredientModel["code"] += " (<%d)" % convOptions.doseCountLimit
                        ingredientModel["description"] += " (<%d doses)" % convOptions.doseCountLimit
                yield ingredientModel
        elif convOptions.maxMixtureCount is None or ingredientCount <= convOptions.maxMixtureCount:
            # Composite into a single denormalized item
            ingredientList.sort(RowItemFieldComparator("description"))  # Ensure a stable sort order
            idStrList = list()
            descriptionList = list()
            for ingredientModel in ingredientList:
                medId = ingredientModel["medication_id"]
                idStrList.append(str(medId))
                descriptionList.append(ingredientModel["description"])
            idComposite = str.join(",", idStrList)
            descriptionComposite = str.join("-", descriptionList)

            # Build on the last mix item's row model.
            # Create an arbitrary integer ID by hashing, to try to be unique
            # https://stackoverflow.com/questions/16008670/python-how-to-hash-a-string-into-8-digits
            number = int(hashlib.sha1(idComposite).hexdigest(), 16) % (10**12)
            rowModel["medication_id"] = number
            rowModel["code"] = RXCUI_CODE_TEMPLATE % idComposite
            # Hard to trace back to Order_Med.medication_id from here, since working with Order_Med_MixInfo records
            #rowModel["code"] = GENERIC_CODE_TEMPLATE % rowModel["medication_id"];
            rowModel["description"] = "%s (%s)" % (descriptionComposite, rowModel["med_route"])

            if convOptions.doseCountLimit is not None and rowModel["number_of_doses"] is not None:
                if rowModel["number_of_doses"] < convOptions.doseCountLimit:
                    rowModel["code"] += " (<%d)" % convOptions.doseCountLimit
                    rowModel["description"] += " (<%d doses)" % convOptions.doseCountLimit
            yield rowModel
        else:
            # ingredientCount > convOptions.maxMixtureCount. Too many components; don't try to use the mixture, defer to a summary label
            pass
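# Standalone sketch of the composite-ID hashing trick used above: deterministically map
# an arbitrary ID string to a (probably) unique 12-digit integer. The .encode() call is
# added so the same sketch runs under Python 3, where hashlib requires bytes.
import hashlib

def _sketchCompositeId(idComposite):
    return int(hashlib.sha1(idComposite.encode("utf-8")).hexdigest(), 16) % (10 ** 12)

#print(_sketchCompositeId("2001,2002,2003"))  # Same input always yields the same pseudo-ID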
def __call__(self, query):
    # Given query items, use the model to find related topics with relationship scores
    # Load item category lookup information
    if self.itemsById is None:
        self.initItemLookups(query)

    # Load model weight parameters once to save time on serial queries
    if self.weightByItemIdByTopicId is None:
        self.weightByItemIdByTopicId = self.modeler.generateWeightByItemIdByTopicId(self.model, query.itemsPerCluster)

    # Adapt query into bag-of-words format
    queryItemCountById = query.queryItemIds
    if not isinstance(queryItemCountById, dict):
        # Not a dictionary, probably a one-dimensional list/set, so just add counts of 1
        itemIds = queryItemCountById
        queryItemCountById = dict()
        for itemId in itemIds:
            queryItemCountById[itemId] = 1
    observedIds = set()
    queryBag = list(self.modeler.itemCountByIdToBagOfWords(queryItemCountById, observedIds, self.itemsById, query.excludeCategoryIds))

    # Primary model execution. Apply to the query to generate a scored relationship to each "topic"
    topicWeights = self.model[queryBag]
    weightByTopicId = dict()
    for (topicId, topicWeight) in topicWeights:
        weightByTopicId[topicId] = topicWeight

    # Composite scores for (recommendable) items by taking a weighted average across the top items for each topic
    recScoreByItemId = dict()
    for itemId in self.candidateItemIds:
        if self.isItemRecommendable(itemId, queryItemCountById, query, self.categoryIdByItemId):
            recScoreByItemId[itemId] = 0.0

    for topicId, topicWeight in weightByTopicId.iteritems():
        if topicWeight > query.minClusterWeight:  # Ignore topics with tiny contributions
            weightByItemId = self.weightByItemIdByTopicId[topicId]
            for itemId in recScoreByItemId.keys():
                itemWeight = 0.0
                if itemId in weightByItemId:
                    itemWeight = weightByItemId[itemId]
                recScoreByItemId[itemId] += topicWeight * itemWeight

    # Build item models into a list so they can be sorted by score
    recommendedData = list()
    for itemId, totalItemWeight in recScoreByItemId.iteritems():
        tfidf = 0.0
        if itemId in self.docCountByWordId and self.docCountByWordId[itemId] > 0.0:
            # Scale TF*IDF score based on baseline document counts to prioritize disproportionately common items
            tfidf = totalItemWeight * self.docCountByWordId[None] / self.docCountByWordId[itemId]

        itemModel = \
            {
                "totalItemWeight": totalItemWeight, "tf": totalItemWeight, "PPV": totalItemWeight, "P(item|query)": totalItemWeight, "P(B|A)": totalItemWeight,
                "tfidf": tfidf, "lift": tfidf, "interest": tfidf, "P(item|query)/P(item)": tfidf, "P(B|A)/P(B)": tfidf,
                "clinical_item_id": itemId,
                "weightByTopicId": weightByTopicId,  # Duplicate for each item, but persist here to enable retrieval by the caller
                "numSelectedTopics": len(weightByTopicId),
            }
        itemModel["score"] = itemModel[query.sortField]
        recommendedData.append(itemModel)

    recommendedData.sort(RowItemFieldComparator("score"), reverse=True)
    return recommendedData
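# Hedged sketch of the "self.model[queryBag]" step, assuming a gensim-style topic model
# (e.g., LdaModel): indexing the model with a bag-of-words yields (topicId, weight) pairs,
# matching the unpacking loop above. The toy documents and dictionary are hypothetical.
from gensim import corpora, models

documents = [["ekg", "troponin", "aspirin"], ["cbc", "chem7", "blood_culture"]]
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2)

queryBag = dictionary.doc2bow(["ekg", "aspirin"])
weightByTopicId = dict(lda[queryBag])  # {topicId: topicWeight, ...}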
def analyzePatientItems(self, patientItemData, analysisQuery, recQuery, patientId, recommender, conn):
    """Given the primary query data and clinical item list for a given test patient,
    parse through the item list and run a query to get the top recommended IDs,
    to produce the relevant verify and recommendation item ID sets for comparison.
    """
    if "queryItemCountById" not in patientItemData:
        # Apparently not able to find / extract the relevant data, so skip this record
        return None
    queryItemCountById = patientItemData["queryItemCountById"]
    verifyItemCountById = patientItemData["verifyItemCountById"]

    # Query for order set linked items
    orderSetQuery = \
        """select ic.external_id, pi.clinical_item_id, pi.item_date >= %(p)s as is_verify_item
        from patient_item as pi, patient_item_collection_link as picl, item_collection_item as ici, item_collection as ic
        where patient_id = %(p)s
        and pi.patient_item_id = picl.patient_item_id
        and picl.item_collection_item_id = ici.item_collection_item_id
        and ici.item_collection_id = ic.item_collection_id
        and ic.section <> %(p)s
        and item_date >= %(p)s and item_date < %(p)s
        """ % {"p": DBUtil.SQL_PLACEHOLDER}
    orderSetParams = \
        (
            patientItemData["queryEndTime"],
            patientItemData["patient_id"],
            AD_HOC_SECTION,
            patientItemData["baseItemDate"],
            patientItemData["verifyEndTime"],
        )
    resultTable = DBUtil.execute(orderSetQuery, orderSetParams, includeColumnNames=True, conn=conn)
    allOrderSetItems = modelListFromTable(resultTable)
    orderSetItemsByOrderSetId = dict()
    for orderSetItem in allOrderSetItems:
        orderSetId = orderSetItem["external_id"]
        if orderSetId not in orderSetItemsByOrderSetId:
            orderSetItemsByOrderSetId[orderSetId] = list()
        orderSetItemsByOrderSetId[orderSetId].append(orderSetItem)
    keyOrderSetIds = orderSetItemsByOrderSetId.keys()

    if analysisQuery.numRecsByOrderSet:
        # Only use the specified key order set for each set of patient data
        orderSetId = patientItemData["order_set_id"]
        if orderSetId not in keyOrderSetIds:
            # No valid order set orders to use in this setting. Skip this case
            return None
        keyOrderSetIds = [orderSetId]

    # Pre-cache order set item data
    if self.supportRecommender.itemIdsByOrderSetId is None:
        self.supportRecommender.initItemLookups(analysisQuery.baseRecQuery)

    # For each order set, count up how many order set linked items were used.
    # Count up how many items were used indirectly that would have been within an order set.
    # Organize by whether the item occurred during the "verify" vs. "query" time period
    usedItemIdsByOrderSetIdByIsVerifyItem = {True: dict(), False: dict()}
    for keyOrderSetId in keyOrderSetIds:
        orderSetItems = orderSetItemsByOrderSetId[keyOrderSetId]
        for orderSetItem in orderSetItems:
            isVerifyItem = orderSetItem["is_verify_item"]
            orderSetId = orderSetItem["external_id"]
            itemId = orderSetItem["clinical_item_id"]
            usedItemIdsByOrderSetId = usedItemIdsByOrderSetIdByIsVerifyItem[isVerifyItem]
            if orderSetId not in usedItemIdsByOrderSetId:
                usedItemIdsByOrderSetId[orderSetId] = set()
            usedItemIdsByOrderSetId[orderSetId].add(itemId)

    # Summarize into the total number of (unique) items available from the used order sets,
    # and which of those items were actually used from the order set
    allUsedOrderSetItemIds = set()
    allUsedOrderSetIds = set()
    recommendableUsedOrderSetItemIds = set()
    allAvailableOrderSetItemIds = set()
    allAvailableVerifyOrderSetItemIds = set()
    recommendableAvailableOrderSetItemIds = set()
    for isVerifyItem, usedItemIdsByOrderSetId in usedItemIdsByOrderSetIdByIsVerifyItem.iteritems():
        for orderSetId, usedItemIds in usedItemIdsByOrderSetId.iteritems():
            allUsedOrderSetIds.add(orderSetId)
            if isVerifyItem:
                allAvailableVerifyOrderSetItemIds.update(self.supportRecommender.itemIdsByOrderSetId[orderSetId])

            for itemId in usedItemIds:
                if self.supportRecommender.isItemRecommendable(itemId, None, recQuery, self.supportRecommender.categoryIdByItemId):
                    recommendableUsedOrderSetItemIds.add(itemId)
            allUsedOrderSetItemIds.update(usedItemIds)

            for itemId in self.supportRecommender.itemIdsByOrderSetId[orderSetId]:
                if self.supportRecommender.isItemRecommendable(itemId, None, recQuery, self.supportRecommender.categoryIdByItemId):
                    recommendableAvailableOrderSetItemIds.add(itemId)
            allAvailableOrderSetItemIds.update(self.supportRecommender.itemIdsByOrderSetId[orderSetId])

    # Treat available order set items from the verify time period like "recommended data"
    recommendedData = list()
    for itemId in allAvailableVerifyOrderSetItemIds:
        if not self.supportRecommender.isItemRecommendable(itemId, None, recQuery, self.supportRecommender.categoryIdByItemId):
            continue  # Skip items that do not fit the recommendable criteria (i.e., excluded categories) for fair comparison
        recItemModel = dict(self.supportRecommender.itemsById[itemId])
        recItemModel["score"] = recItemModel[recQuery.sortField]
        recommendedData.append(recItemModel)
    recommendedData.sort(RowItemFieldComparator(recQuery.sortField), reverse=True)

    # Distill down to just the set of recommended item IDs
    recommendedItemIds = set()
    for i, recommendationModel in enumerate(recommendedData):
        if analysisQuery.numRecommendations > 0 and i >= analysisQuery.numRecommendations:
            break
        recommendedItemIds.add(recommendationModel["clinical_item_id"])

    # Summary metrics on how many order items in the query and verify periods are recommendable.
    # An outer join query for order set items should work the same way, but maybe simpler to follow as a second query
    itemQuery = \
        """select pi.clinical_item_id, pi.item_date >= %(p)s as is_verify_item
        from patient_item as pi
        where patient_id = %(p)s
        and item_date >= %(p)s and item_date < %(p)s
        """ % {"p": DBUtil.SQL_PLACEHOLDER}
    itemParams = \
        (
            patientItemData["queryEndTime"],
            patientItemData["patient_id"],
            patientItemData["baseItemDate"],
            patientItemData["verifyEndTime"],
        )
    resultTable = DBUtil.execute(itemQuery, itemParams, conn=conn)
    recommendableQueryItemIds = set()
    recommendableVerifyItemIds = set()
    for itemId, isVerifyItem in resultTable:
        if self.supportRecommender.isItemRecommendable(itemId, None, recQuery, self.supportRecommender.categoryIdByItemId):
            if not isVerifyItem:
                recommendableQueryItemIds.add(itemId)
            else:
                recommendableVerifyItemIds.add(itemId)

    # Order Set Usage Summary Data
    orderSetItemData = \
        {
            "allUsedOrderSetIds": allUsedOrderSetIds,
            "allUsedOrderSetItemIds": allUsedOrderSetItemIds,
            "allAvailableOrderSetItemIds": allAvailableOrderSetItemIds,
            "recommendableUsedOrderSetItemIds": recommendableUsedOrderSetItemIds,
            "recommendableAvailableOrderSetItemIds": recommendableAvailableOrderSetItemIds,
            "recommendableQueryItemIds": recommendableQueryItemIds,
            "recommendableVerifyItemIds": recommendableVerifyItemIds,
        }

    return (queryItemCountById, verifyItemCountById, recommendedItemIds, recommendedData, orderSetItemData)
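# Hedged sketch of how a caller might turn the returned ID sets into comparison metrics.
# The surrounding analysis class presumably has its own evaluation pipeline, so this is
# illustrative only: precision/recall of recommended items against the verify-period items.
def _sketchPrecisionRecall(recommendedItemIds, verifyItemCountById):
    verifyItemIds = set(verifyItemCountById.keys())
    truePositives = recommendedItemIds & verifyItemIds
    precision = float(len(truePositives)) / len(recommendedItemIds) if recommendedItemIds else 0.0
    recall = float(len(truePositives)) / len(verifyItemIds) if verifyItemIds else 0.0
    return (precision, recall)

#print(_sketchPrecisionRecall({1, 2, 3}, {2: 1, 3: 2, 4: 1}))  # (0.666..., 0.666...)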