def action_default(self):
    """Run the recommender for the current request and render the results as HTML rows.

    Builds a RecommenderQuery from ``self.requestData``, runs
    ``self.recommender`` on it, then stores the header labels under
    ``self.requestData["fieldHeaders"]`` and the formatted table rows under
    ``self.requestData["dataRows"]``.
    """
    recQuery = RecommenderQuery()
    recQuery.parseParams(self.requestData)
    displayFields = recQuery.getDisplayFields()

    results = self.recommender(recQuery)

    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source; only the denormalization step is assumed to be guarded by the
    # non-empty check -- confirm against the original layout.
    if len(results) > 0:
        # Denormalize results with links to clinical item descriptions
        self.recommender.formatRecommenderResults(results)

    # Format for HTML and add a control field for interaction with the data
    for resultRow in results:
        self.prepareResultRow(resultRow, displayFields)

    # Header labels use the original field names, while the displayed columns
    # use the format-suffixed versions of those fields.
    fieldHeaders, suffixedFields = self.prepareDisplayHeaders(displayFields)
    self.requestData["fieldHeaders"] = fieldHeaders

    columns = [
        "controls",
        "rank",
        "name",
        "description",
        "category_description",
        *suffixedFields,
    ]

    htmlFormatter = HtmlResultsFormatter(StringIO(), valign="middle", align="center")
    htmlFormatter.formatResultDicts(results, columns)
    self.requestData["dataRows"] = htmlFormatter.getOutFile().getvalue()
def test_recommender_aggregation(self):
    """Check recommender output under each score aggregation method.

    The same two-item query is run with the default aggregation and then
    with "unweighted", "SerialBayes" and "NaiveBayes"; each run is compared
    against fixed expected rows. A final run adds a ``freqRatio>`` field
    filter on top of the last aggregation setting.
    """
    query = RecommenderQuery()
    query.countPrefix = "patient_"
    query.queryItemIds = {-2, -5}
    # excludeItemIds, categoryIds and timeDeltaMax are left at their defaults.
    # If timeDeltaMax were set to one of the constants (DELTA_ZERO, DELTA_HOUR,
    # etc.), item associations occurring within that time delta would count as
    # co-occurrent; left blank, all items within a given patient are considered
    # co-occurrent.
    query.limit = 3  # Just get top 3 ranks for simplicity
    query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

    headers = ["clinical_item_id", "conditionalFreq", "freqRatio"]

    # The expected row for item -6 is the same in every scenario below
    stableRow = [-6, 0.16667, 7.142857]

    def verify(*expectedRows):
        """Run the query as currently configured and compare against the expected rows."""
        expectedData = [RowItemModel(list(values), headers) for values in expectedRows]
        recommendedData = self.recommender(query)
        self.assertEqualRecommendedData(expectedData, recommendedData, query)

    # (aggregation method, expected row for item -4); None keeps the default
    # weighted aggregation method.
    scenarios = [
        (None, [-4, 0.3, 22.5]),
        ("unweighted", [-4, 0.32857, 24.64286]),
        ("SerialBayes", [-4, 0.89157, 66.867471]),
        # Naive Bayes value is without truncating negative values;
        # with truncation it would be [-4, 0.8, 58.59707].
        ("NaiveBayes", [-4, 3.75, 281.25]),
    ]
    for method, topRow in scenarios:
        if method is not None:
            query.aggregationMethod = method
        verify(topRow, stableRow)

    # Apply value filter
    query.fieldFilters["freqRatio>"] = 10.0
    verify(stableRow)
def test_tripleSequence_virtualItem(self):
    """Test outcome assessment when the target is a virtual item defined by a triple item sequence.

    The virtual item -16 is declared through the sequence (-15, -14); the
    analysis is run once directly and once through the command-line
    interface, and both are compared against the same expected stats.
    """
    columns = ["patient_id", "outcome.-16", "score.-16"]

    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = {-22222}
    analysisQuery.baseCategoryId = -7
    analysisQuery.queryTimeSpan = timedelta(seconds=86400)
    analysisQuery.sequenceItemIdsByVirtualItemId[-16] = (-15, -14)
    # BaselineFrequencyRecommender is an alternative that is not exercised here
    analysisQuery.recommender = ItemAssociationRecommender()
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.targetItemIds = {-16}
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data

    # Initial run without time limits on outcome measure
    expected = [RowItemModel([-22222, 1, 0.14286], columns)]
    self.assertEqualStatResults(expected, self.analyzer(analysisQuery), columns)

    # Redo but run through command-line interface,
    # redirecting stdout to collect the test results
    sys.stdout = StringIO()
    commandLine = [
        "OutcomePredictionAnalysis.py",
        "-c", "-7",
        "-Q", "86400",
        "-o", "-16=-15:-14",
        "-m", "0",
        "-R", "ItemAssociationRecommender",
        "0,-22222",
        "-",
    ]
    self.analyzer.main(commandLine)
    capturedOutput = StringIO(sys.stdout.getvalue())
    self.assertEqualStatResultsTextOutput(expected, capturedOutput, columns)
def main(self, argv):
    """Main method, callable from command line.

    Parses ``argv`` (program name first), builds an AnalysisQuery around a
    TopicModelRecommender loaded from the ``-M`` model file, runs the
    analysis by calling ``self(query)`` and writes the results as text to
    the optional output file argument.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and is
            skipped when parsing options.

    Raises:
        SystemExit: With status -1 after printing usage when no input file
            argument is given.
    """
    usageStr = "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
               " <inputFile> Validation file in prepared result file format use generated LDA models to predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
               " <outputFile> Validation result stat summaries.\n"
    parser = OptionParser(usage=usageStr)
    parser.add_option("-M", "--modelFile", dest="modelFile", help="Name of the file to load an LDA or HDP model and topic word document counts from.")
    parser.add_option("-X", "--excludeCategoryIds", dest="excludeCategoryIds", help="For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids.")
    parser.add_option("-i", "--itemsPerCluster", dest="itemsPerCluster", default=DEFAULT_TOPIC_ITEM_COUNT, help="Specify number of top topic items to consider when scoring recommendations.")
    parser.add_option("-m", "--minClusterWeight", dest="minClusterWeight", default=DEFAULT_MIN_TOPIC_WEIGHT, help="When scoring recommendations, skip any topics with less than this relation weight (effectively scores as zero, but can avoid a lot of low yield calculations).")
    parser.add_option("-s", "--sortField", dest="sortField", default=DEFAULT_SORT_FIELD, help="Score field to sort top recommendations by. Default to posterior probabilty 'totelItemWeight', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.")
    parser.add_option("-r", "--numRecs", dest="numRecs", default=DEFAULT_RECOMMENDED_ITEM_COUNT, help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.")
    parser.add_option("-O", "--numRecsByOrderSet", dest="numRecsByOrderSet", action="store_true", help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: "+str.join(" ", argv))
    timer = time.time()
    if len(args) >= 1:
        query = AnalysisQuery()
        query.preparedPatientItemFile = stdOpen(args[0])
        query.recommender = TopicModelRecommender(options.modelFile)
        query.baseRecQuery = RecommenderQuery()

        if options.excludeCategoryIds is not None:
            # Fixed: this previously read the non-existent attribute
            # `options.executeCategoryIds`, so supplying -X always failed with
            # an AttributeError. The option's dest is `excludeCategoryIds`.
            query.baseRecQuery.excludeCategoryIds = {
                int(categoryIdStr)
                for categoryIdStr in options.excludeCategoryIds.split(",")
            }
        else:
            # Default exclusions if none specified
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds()
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds()

        # Option values may arrive as command-line strings, so convert explicitly
        query.baseRecQuery.itemsPerCluster = int(options.itemsPerCluster)
        query.baseRecQuery.minClusterWeight = float(options.minClusterWeight)

        query.baseRecQuery.sortField = options.sortField
        query.numRecommendations = int(options.numRecs)
        query.numRecsByOrderSet = options.numRecsByOrderSet

        # Run the actual analysis
        analysisResults = self(query)

        # Format the results for output. The output file name stays None when
        # no second positional argument is given.
        outputFilename = None
        if len(args) > 1:
            outputFilename = args[1]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with analysis arguments to allow for deconstruction later
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        formatter = TextResultsFormatter(outputFile)
        colNames = self.resultHeaders(query)
        formatter.formatTuple(colNames)  # Insert a mock record to get a header / label row
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def test_recommenderAnalysis(self):
    """Run the analyzer against the mock test data and verify the expected rank table.

    Covers the baseline frequency recommender, the item association
    recommender, and the item association recommender with a cap on the
    number of query items. Scores are matched with SENTINEL_ANY_FLOAT, so
    only the other columns are compared by value.
    """
    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = {-11111}
    analysisQuery.recommender = BaselineFrequencyRecommender()
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data
    # Default item / category exclusions and a timeDeltaMax are intentionally
    # not applied to the base query in this test.

    # Column layout of the expected tuples below (kept for reference only)
    colNames = ["patient_id", "clinical_item_id", "iItem", "iRecItem", "recRank", "recScore"]

    anyScore = SENTINEL_ANY_FLOAT  # Specific scores don't matter, as long as ranks are correct

    def verify(expectedRows):
        """Run the analysis as currently configured and compare the result table."""
        actualRows = self.analyzer(analysisQuery)
        self.assertEqualTable(expectedRows, actualRows, 3)

    # Start with default recommender
    verify([
        (-11111, -4, 0, 0, 1, anyScore),   # 0.170
        (-11111, -10, 1, 1, 4, anyScore),  # 0.032
        (-11111, -8, 2, 2, 5, anyScore),   # 0.025
        (-11111, -12, 4, 3, 2, anyScore),  # 0.053
    ])

    # Now try targeted recommender
    analysisQuery.recommender = ItemAssociationRecommender()
    verify([
        (-11111, -4, 0, 0, 1, anyScore),   # 0.167
        (-11111, -10, 1, 1, 2, anyScore),  # 0.304
        (-11111, -8, 2, 2, 5, anyScore),   # 0.190
        (-11111, -12, 4, 3, 1, anyScore),  # 0.444
    ])

    # Repeat, but put a limit on the maximum number of query items
    # and recommendations we want analyzed
    analysisQuery.queryItemMax = 2
    verify([
        (-11111, -4, 0, 0, 1, anyScore),   # 0.167
        (-11111, -10, 1, 1, 2, anyScore),  # 0.304
    ])
def test_recommender_stats(self):
    """Run the recommender against the mock test data and verify the derived stats.

    First queries with patient-level counts sorted by "P-Fisher", then
    re-parses the parameters to use non-unique counts sorted by "oddsRatio".
    """
    query = RecommenderQuery()

    def verify(headers, expectedRows):
        """Run the query as currently configured and compare the stat columns."""
        expectedData = [RowItemModel(values, headers) for values in expectedRows]
        recommendedData = self.recommender(query)
        self.assertEqualRecommendedDataStats(expectedData, recommendedData, headers)

    baseHeaders = [
        "clinical_item_id", "N", "nB", "nA", "nAB",
        "conditionalFreq", "baselineFreq", "freqRatio",
    ]

    initialParams = {
        "countPrefix": "patient_",
        "queryItemIds": "-6",
        "resultCount": "3",  # Just get top 3 ranks for simplicity
        "maxRecommendedId": "0",  # Artificial constraint to focus only on test data
        "sortField": "P-Fisher",  # Specifically request derived expected vs. observed stats
    }
    query.parseParams(initialParams)

    log.debug("Query with single item not perturbed by others.")
    verify(
        baseHeaders + ["P-Fisher"],
        [
            [-2, SIMULATED_PATIENT_COUNT, 30.0, 70.0, 7.0, 0.1, 0.0100, 10.0, 3.7e-06],
            [-4, SIMULATED_PATIENT_COUNT, 40.0, 70.0, 20.0, 0.286, 0.0133, 21.42857, 1.2e-23],
        ],
    )

    log.debug("Query for non-unique counts.")
    overrideParams = {
        "countPrefix": "",
        "sortField": "oddsRatio",
    }
    query.parseParams(overrideParams)
    verify(
        baseHeaders + ["oddsRatio"],
        [
            [-4, SIMULATED_PATIENT_COUNT, 40.0, 70.0, 25.0, 0.35714, 0.01333, 26.7857, 107.96296],
            [-2, SIMULATED_PATIENT_COUNT, 30.0, 70.0, 12.0, 0.1714, 0.01, 17.1429, 33.47126],
        ],
    )
def test_parsePreparedResultFile(self):
    """Verify that the preparer's text output parses back into the original result objects.

    The analysis preparer is run directly and through its command-line
    interface; the command-line text output is parsed with
    ``parsePreparedResultFile`` and compared to the direct results on the
    key columns.
    """
    # Key columns to verify
    keyColumns = [
        "patient_id", "baseItemId", "baseItemDate",
        "queryStartTime", "queryEndTime", "verifyEndTime",
        "queryItemCountById", "verifyItemCountById",
        "outcome.-33", "outcome.-32", "outcome.-31", "outcome.-30",
    ]

    # Initial run without time limits on outcome measure
    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = {-11111, -44444}
    analysisQuery.baseCategoryId = -7
    analysisQuery.queryTimeSpan = timedelta(seconds=86400)
    analysisQuery.verifyTimeSpan = timedelta(seconds=604800)
    analysisQuery.baseRecQuery = RecommenderQuery()
    baseRecQuery = analysisQuery.baseRecQuery
    baseRecQuery.targetItemIds = {-33, -32, -31, -30}
    baseRecQuery.excludeItemIds = [-13]
    baseRecQuery.excludeCategoryIds = [-5]
    baseRecQuery.maxRecommendedId = 0  # Restrict to test data
    directResults = list(self.analyzer(analysisQuery))

    # Redo but run through command-line interface,
    # redirecting stdout to collect the test results
    sys.stdout = StringIO()
    commandLine = [
        "PreparePatientItems.py",
        "-c", "-7",
        "-Q", "86400",
        "-V", "604800",
        "-o", "-33,-32,-31,-30",
        "0,-11111,-44444",
        "-",
    ]
    self.analyzer.main(commandLine)
    capturedOutput = StringIO(sys.stdout.getvalue())
    textBasedResults = list(self.analyzer.parsePreparedResultFile(capturedOutput))

    self.assertEqualResultDicts(directResults, textBasedResults, keyColumns)
def action_default(self):
    """Look for related orders by association / recommender methods.

    When a simulated patient is specified in the request, the patient's
    recent item IDs are excluded from the recommendations and, if no query
    items were given explicitly, used as the query items. Results are
    rendered to HTML rows stored in ``self.requestData["dataRows"]``, with
    header labels in ``self.requestData["fieldHeaders"]``.
    """
    request = self.requestData

    # If patient is specified then modify query and exclusion list
    # based on items already ordered for the patient
    recentItemIds = set()
    if request["sim_patient_id"]:
        patientId = int(request["sim_patient_id"])
        simTime = int(request["sim_time"])
        # Recent item IDs (orders, diagnoses, unlocked results, etc.)
        # that related order queries will be based off of
        simManager = SimManager()
        recentItemIds = simManager.recentItemIds(patientId, simTime)

    # Recommender instance to test on
    self.recommender = ItemAssociationRecommender()
    # Allow caching of data for rapid successive queries
    self.recommender.dataManager.dataCache = webDataCache

    query = RecommenderQuery()
    if request["sortField"] == "":
        # P-Fisher-NegLog should yield better results, but beware, much longer to calculate
        request["sortField"] = "P-YatesChi2-NegLog"
    query.parseParams(request)

    # Fall back to the recommender's default exclusions when none were parsed
    if len(query.excludeItemIds) == 0:
        query.excludeItemIds = self.recommender.defaultExcludedClinicalItemIds()
    if len(query.excludeCategoryIds) == 0:
        query.excludeCategoryIds = self.recommender.defaultExcludedClinicalItemCategoryIds()

    displayFields = []
    if request["displayFields"] != "":
        displayFields = request["displayFields"].split(",")

    # Exclude items already ordered for the patient from any recommended list
    query.excludeItemIds.update(recentItemIds)
    if not query.queryItemIds:
        # If no specific query items specified, then use the recent patient item IDs
        query.queryItemIds.update(recentItemIds)

    results = self.recommender(query)

    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source; only the denormalization step is assumed to be guarded by the
    # non-empty check -- confirm against the original layout.
    if len(results) > 0:
        # Denormalize results with links to clinical item descriptions
        self.recommender.formatRecommenderResults(results)

    # Header labels use the original field names, while the displayed columns
    # use the format-suffixed versions of those fields.
    fieldHeaders, suffixedFields = self.prepareDisplayHeaders(displayFields)
    request["fieldHeaders"] = fieldHeaders

    # Format for HTML and add a control field for interaction with the data
    for row in results:
        self.prepareResultRow(row, displayFields)

    # Try organize by category
    if request["groupByCategory"]:
        results = self.recommender.organizeByCategory(results)

    # "name" (for code) and "category_description" columns are not displayed
    columns = ["controls"]
    columns.extend(suffixedFields)
    columns.append("description")

    htmlLines = []
    previousRow = None
    for row in results:
        isNewCategory = (
            previousRow is None
            or previousRow["category_description"] != row["category_description"]
        )
        # Limit category display if many repeats
        showCategory = request["groupByCategory"] and isNewCategory
        if showCategory:
            htmlLines.append(CATEGORY_HEADER_TEMPLATE % row)
        htmlLines.append(self.formatRowHTML(row, columns, showCategory))
        previousRow = row
    request["dataRows"] = "\n".join(htmlLines)
def main(self, argv):
    """Main method, callable from command line.

    Parses ``argv`` (program name first), builds an AnalysisQuery around an
    OrderSetRecommender with its default exclusions, runs the analysis by
    calling ``self(query)`` and writes the results as text to the optional
    output file argument. Prints usage and exits with status -1 when no
    input file argument is given.
    """
    usageStr = "usage: %prog [options] <inputFile> [<outputFile>]\n"+\
               " <inputFile> Validation file in prepared result file format. Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"+\
               " <outputFile> Validation result stat summaries.\n"
    optionParser = OptionParser(usage=usageStr)
    optionParser.add_option(
        "-r", "--numRecs",
        dest="numRecs",
        default=DEFAULT_RECOMMENDED_ITEM_COUNT,
        help="Number of orders / items to recommend for comparison against the verification set, sorted in prevalence order. If skip or set <1, then will use all order set items found.",
    )
    optionParser.add_option(
        "-O", "--numRecsByOrderSet",
        dest="numRecsByOrderSet",
        action="store_true",
        help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.",
    )
    optionParser.add_option(
        "-s", "--sortField",
        dest="sortField",
        default=DEFAULT_SORT_FIELD,
        help="Allow overriding of default sort field when returning ranked results (patient_count, name, description, etc.)",
    )
    options, positionalArgs = optionParser.parse_args(argv[1:])

    log.info("Starting: " + " ".join(argv))
    startTime = time.time()

    if len(positionalArgs) < 1:
        # No input file given: show usage and stop
        optionParser.print_help()
        sys.exit(-1)

    query = AnalysisQuery()
    query.preparedPatientItemFile = stdOpen(positionalArgs[0])
    query.recommender = OrderSetRecommender()
    query.baseRecQuery = RecommenderQuery()

    # Always use the recommender's default exclusions
    recommender = query.recommender
    query.baseRecQuery.excludeCategoryIds = recommender.defaultExcludedClinicalItemCategoryIds()
    query.baseRecQuery.excludeItemIds = recommender.defaultExcludedClinicalItemIds()
    query.baseRecQuery.sortField = options.sortField
    query.numRecommendations = int(options.numRecs)
    query.numRecsByOrderSet = options.numRecsByOrderSet

    # Run the actual analysis
    analysisResults = self(query)

    # Format the results for output; the file name is None when not specified
    outputFilename = positionalArgs[1] if len(positionalArgs) > 1 else None
    outputFile = stdOpen(outputFilename, "w")

    # Print comment line with analysis arguments to allow for deconstruction later
    print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

    resultFormatter = TextResultsFormatter(outputFile)
    headerNames = self.resultHeaders(query)
    resultFormatter.formatTuple(headerNames)  # Insert a mock record to get a header / label row
    resultFormatter.formatResultDicts(analysisResults, headerNames)

    elapsed = time.time() - startTime
    log.info("%.3f seconds to complete", elapsed)
def test_recommenderAnalysis(self):
    """Verify classification-analysis statistics through three equivalent routes.

    Every scenario is checked (1) by calling the analyzer directly with an
    AnalysisQuery, (2) through the analyzer's command-line ``main`` and
    (3) by feeding the preparer's command-line output to the analyzer via
    redirected stdin. All three are compared against the same expected rows.
    """
    analyzerScript = "RecommendationClassificationAnalysis.py"
    preparerScript = "PreparePatientItems.py"

    def checkDirect(query, expected, columns):
        """Call the analyzer directly and compare the stat results."""
        self.assertEqualStatResults(expected, self.analyzer(query), columns)

    def checkCommandLine(analyzerArgs, expected, columns):
        """Run the analyzer's command-line interface and compare its text output."""
        sys.stdout = StringIO()  # Redirect stdout output to collect test results
        self.analyzer.main([analyzerScript] + analyzerArgs)
        captured = StringIO(sys.stdout.getvalue())
        self.assertEqualStatResultsTextOutput(expected, captured, columns)

    def checkPreparedFile(preparerArgs, analyzerArgs, expected, columns):
        """Run the preparer, then feed its output to the analyzer through stdin."""
        sys.stdout = StringIO()
        self.preparer.main([preparerScript] + preparerArgs)
        sys.stdin = StringIO(sys.stdout.getvalue())  # Read prepared data file from redirected stdin
        sys.stdout = StringIO()
        self.analyzer.main([analyzerScript] + analyzerArgs)
        captured = StringIO(sys.stdout.getvalue())
        self.assertEqualStatResultsTextOutput(expected, captured, columns)

    # Analyzer arguments shared by most prepared-file runs of the
    # item association recommender
    assocFromPrepared = ["-P", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "-", "-"]

    # Run the recommender against the mock test data and verify expected stats afterwards.
    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = {-11111}
    analysisQuery.numQueryItems = 1
    analysisQuery.numVerifyItems = 3
    analysisQuery.numRecommendations = 4
    analysisQuery.recommender = BaselineFrequencyRecommender()
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data

    # Don't use items whose default is to be excluded from recommendations
    analysisQuery.baseRecQuery.excludeCategoryIds = analysisQuery.recommender.defaultExcludedClinicalItemCategoryIds()
    analysisQuery.baseRecQuery.excludeItemIds = analysisQuery.recommender.defaultExcludedClinicalItemIds()
    # No timeDeltaMax is set, so queries just default to all times

    colNames = [
        "patient_id", "TP", "FN", "FP", "recall", "precision", "F1-score",
        "weightRecall", "weightPrecision", "normalRecall", "normalPrecision", "ROC-AUC",
    ]

    # Start with default recommender
    expectedResults = [RowItemModel([-11111, 1, 2, 3, 0.333, 0.25, 0.286, 0.208, 0.254, 0.333/1.0, 0.25/0.75, 0.524], colNames)]
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-q", "1", "-v", "3", "-r", "4", "-m", "0", "-R", "BaselineFrequencyRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(
        ["-q", "1", "-v", "3", "0,-11111", "-"],
        ["-P", "-r", "4", "-m", "0", "-R", "BaselineFrequencyRecommender", "-", "-"],
        expectedResults, colNames,
    )

    # Now try targeted recommender
    analysisQuery.recommender = ItemAssociationRecommender()
    expectedResults = [RowItemModel([-11111, 1, 2, 3, 0.333, 0.25, 0.286, 0.347, 0.293, 0.333, 0.25/0.75, 0.6666], colNames)]
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-q", "1", "-v", "3", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(["-q", "1", "-v", "3", "0,-11111", "-"], assocFromPrepared, expectedResults, colNames)

    # Now try multiple query items targeted recommender
    analysisQuery.numQueryItems = 2
    expectedResults = [RowItemModel([-11111, 1, 2, 3, 0.333, 0.25, 0.286, 0.254, 0.194, 0.333, 0.25/0.75, 0.4167], colNames)]
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-q", "2", "-v", "3", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(["-q", "2", "-v", "3", "0,-11111", "-"], assocFromPrepared, expectedResults, colNames)

    # More query items with aggregation options
    analysisQuery.numQueryItems = 3
    expectedResults = [RowItemModel([-11111, 1, 1, 3, 0.5, 0.25, 0.333, 0.517, 0.194, 0.5, 0.25/0.5, 0.4166], colNames)]
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-q", "3", "-v", "3", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(["-q", "3", "-v", "3", "0,-11111", "-"], assocFromPrepared, expectedResults, colNames)

    # Value filters
    analysisQuery.baseRecQuery.sortField = "freqRatio"
    analysisQuery.baseRecQuery.fieldFilters["freqRatio>"] = 70
    expectedResults = [RowItemModel([-11111, 2, 0, 2, 1.0, 0.5, 0.6666, 1.0, 0.446, 1.0, 0.5/0.5, 0.375], colNames)]
    checkDirect(analysisQuery, expectedResults, colNames)
    del analysisQuery.baseRecQuery.fieldFilters["freqRatio>"]  # Undo to not affect subsequent queries
    checkCommandLine(
        ["-s", "freqRatio", "-f", "freqRatio>:70.0", "-q", "3", "-v", "3", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(
        ["-q", "3", "-v", "3", "0,-11111", "-"],
        ["-P", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "-s", "freqRatio", "-f", "freqRatio>:70.0", "-", "-"],
        expectedResults, colNames,
    )

    # Unweighted aggregation
    # NOTE(review): this sets `weightingMethod`, whereas test_recommender_aggregation
    # sets `aggregationMethod` for the same "unweighted" value -- confirm which
    # attribute the recommender query actually reads.
    analysisQuery.baseRecQuery.weightingMethod = "unweighted"
    expectedResults = [RowItemModel([-11111, 1, 1, 3, 0.5, 0.25, 0.3333, 0.517, 0.194, 0.5, 0.25/0.5, 0.25], colNames)]
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-s", "freqRatio", "-q", "3", "-v", "3", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "-a", "unweighted", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(
        ["-q", "3", "-v", "3", "0,-11111", "-"],
        ["-s", "freqRatio", "-P", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "-a", "unweighted", "-", "-"],
        expectedResults, colNames,
    )

    # Run by equivalent query time span selection rather than explicit counts
    colNames = [
        "patient_id", "baseItemId", "TP", "FN", "FP", "recall", "precision", "F1-score",
        "weightRecall", "weightPrecision", "ROC-AUC",
    ]
    expectedResults = [RowItemModel([-11111, -4, 1, 1, 3, 0.5, 0.25, 0.333, 0.517, 0.194, 0.4167], colNames)]
    analysisQuery.baseRecQuery.sortField = "conditionalFreq"
    analysisQuery.numQueryItems = None
    analysisQuery.numVerifyItems = None
    analysisQuery.baseCategoryId = -1
    # NOTE(review): the direct query spans 3 hours (10800 s) while the
    # command-line runs below pass "-Q 5400" -- confirm this difference is intended.
    analysisQuery.queryTimeSpan = timedelta(0, 3*60*60)
    analysisQuery.verifyTimeSpan = timedelta(50, 0)
    analysisQuery.numRecommendations = 4
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-c", "-1", "-Q", "5400", "-V", "4320000", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(
        ["-c", "-1", "-Q", "5400", "-V", "4320000", "0,-11111", "-"],
        assocFromPrepared, expectedResults, colNames,
    )

    # Run by query time span by identifying base clinical item, rather than a
    # general category. The expected results are the same as for the category run.
    analysisQuery.numQueryItems = None
    analysisQuery.numVerifyItems = None
    analysisQuery.baseCategoryId = None  # Clear prior setting
    analysisQuery.baseItemId = -4
    analysisQuery.queryTimeSpan = timedelta(0, 3*60*60)
    analysisQuery.verifyTimeSpan = timedelta(50, 0)
    analysisQuery.numRecommendations = 4
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-b", "-4", "-Q", "5400", "-V", "4320000", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(
        ["-b", "-4", "-Q", "5400", "-V", "4320000", "0,-11111", "-"],
        assocFromPrepared, expectedResults, colNames,
    )

    # Basic, then filter test data date range
    colNames = [
        "patient_id", "TP", "FN", "FP", "recall", "precision", "F1-score",
        "weightRecall", "weightPrecision", "ROC-AUC",
    ]
    expectedResults = [RowItemModel([-11111, 1, 1, 3, 0.5, 0.25, 0.33333, 0.4375, 0.29319, 0.66667], colNames)]
    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = {-11111}
    analysisQuery.numQueryItems = 1
    analysisQuery.numVerifyItems = 2
    analysisQuery.numRecommendations = 4
    analysisQuery.recommender = ItemAssociationRecommender()
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-q", "1", "-v", "2", "-r", "4", "-m", "0", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(["-q", "1", "-v", "2", "0,-11111", "-"], assocFromPrepared, expectedResults, colNames)

    # Date Filters
    colNames = [
        "patient_id", "TP", "FN", "FP", "recall", "precision", "F1-score",
        "weightRecall", "weightPrecision", "ROC-AUC",
    ]
    expectedResults = [RowItemModel([-11111, 0, 1, 2, 0.0, 0.0, 0.0, 0.0, 0.0, None], colNames)]
    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = {-11111}
    analysisQuery.numQueryItems = 1
    analysisQuery.numVerifyItems = 2
    analysisQuery.numRecommendations = 4
    analysisQuery.recommender = ItemAssociationRecommender()
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data
    analysisQuery.startDate = datetime(2000, 1, 1, 1)
    analysisQuery.endDate = datetime(2000, 1, 10)
    checkDirect(analysisQuery, expectedResults, colNames)
    checkCommandLine(
        ["-q", "1", "-v", "2", "-r", "4", "-m", "0", "-S", "2000-01-01 01:00:00", "-E", "2000-01-10", "-R", "ItemAssociationRecommender", "0,-11111", "-"],
        expectedResults, colNames,
    )
    checkPreparedFile(
        ["-q", "1", "-v", "2", "-S", "2000-01-01 01:00:00", "-E", "2000-01-10", "0,-11111", "-"],
        assocFromPrepared, expectedResults, colNames,
    )
def action_default(self):
    """Build the HTML table rows of orders related to the request's query items.

    Reads query parameters from ``self.requestData``, runs an
    ItemAssociationRecommender, and stores the rendered rows back into
    ``self.requestData["dataRows"]`` (column header labels go into
    ``self.requestData["fieldHeaders"]``).
    """
    self.recommender = ItemAssociationRecommender()
    self.recommender.dataManager.dataCache = webDataCache

    params = self.requestData
    if params["sortField"] == "":
        # P-Fisher-NegLog should yield better results, but beware, much longer to calculate
        params["sortField"] = "P-YatesChi2-NegLog"

    query = RecommenderQuery()
    query.parseParams(params)
    # Fall back on the recommender's default exclusions when none were requested
    if len(query.excludeItemIds) == 0:
        query.excludeItemIds = self.recommender.defaultExcludedClinicalItemIds()
    if len(query.excludeCategoryIds) == 0:
        query.excludeCategoryIds = self.recommender.defaultExcludedClinicalItemCategoryIds()

    displayFields = params["displayFields"].split(",") if params["displayFields"] != "" else list()

    recommendedData = self.recommender(query)
    if len(recommendedData) > 0:
        # Denormalize results with links to clinical item descriptions
        self.recommender.formatRecommenderResults(recommendedData)

    # Display columns use the Format-suffixed field versions, while the header
    # labels keep the original field names
    headerLabels, suffixedFields = self.prepareDisplayHeaders(displayFields)
    params["fieldHeaders"] = headerLabels

    # Format for HTML and add a control field for interaction with the data
    for rowModel in recommendedData:
        self.prepareResultRow(rowModel, displayFields)

    # Try organize by category
    if params["groupByCategory"]:
        recommendedData = self.recommender.organizeByCategory(recommendedData)

    # "name" (code) and "category_description" are deliberately not shown as columns
    colNames = ["controls"] + list(suffixedFields) + ["description"]

    renderedRows = []
    previousRow = None
    for rowModel in recommendedData:
        newCategory = (
            previousRow is None
            or previousRow["category_description"] != rowModel["category_description"]
        )
        # Only emit a category header when the category changes, to limit repeats
        showCategory = (params["groupByCategory"] and newCategory)
        if showCategory:
            renderedRows.append(CATEGORY_HEADER_TEMPLATE % rowModel)
        renderedRows.append(self.formatRowHTML(rowModel, colNames, showCategory))
        previousRow = rowModel
    params["dataRows"] = "\n".join(renderedRows)
def test_recommenderAnalysis(self):
    """Run the outcome prediction analysis on the mock test data and verify the stats.

    Each scenario is verified three ways: calling the analyzer directly,
    through its command-line interface, and through a prepared data file
    produced by the preparer and fed back in via stdin.

    The test redirects sys.stdout / sys.stdin to capture and feed data; the
    original streams are always restored afterwards so that the redirection
    does not leak into other tests.
    """
    colNames = [
        "patient_id",
        "outcome.-33", "score.-33",
        "outcome.-32", "score.-32",
        "outcome.-31", "score.-31",
        "outcome.-30", "score.-30",
    ]

    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = set([-11111])
    analysisQuery.baseCategoryId = -7
    analysisQuery.queryTimeSpan = timedelta(0, 86400)
    analysisQuery.recommender = ItemAssociationRecommender()
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.targetItemIds = set([-33, -32, -31, -30])
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data

    # (timeDeltaMax in seconds or None for no limit, expected result row)
    scenarios = [
        # Initial run without time limits on outcome measure
        (None, [-11111, +0, 0.222, +2, 0.611, +1, 0.222, +1, 0.222]),
        # Time limitation on outcome measure: 1 week
        (604800, [-11111, +0, 0.222, +2, 0.611, +0, 0.222, +1, 0.222]),
        # Much stricter time limit (negative test case): 2 days
        (172800, [-11111, 0, 0.0109, 2, 0.0600, 0, 0.0109, 0, 0.0109]),
    ]

    originalStdout = sys.stdout
    originalStdin = sys.stdin
    try:
        for timeDeltaSeconds, expectedRow in scenarios:
            timeOptions = []
            if timeDeltaSeconds is not None:
                analysisQuery.baseRecQuery.timeDeltaMax = timedelta(0, timeDeltaSeconds)
                timeOptions = ["-t", str(timeDeltaSeconds)]
            expectedResults = [RowItemModel(expectedRow, colNames)]

            # Direct call with the query object
            analysisResults = self.analyzer(analysisQuery)
            self.assertEqualStatResults(expectedResults, analysisResults, colNames)

            # Redo but run through command-line interface
            sys.stdout = StringIO()  # Redirect stdout output to collect test results
            argv = (
                ["OutcomePredictionAnalysis.py", "-c", "-7", "-Q", "86400"]
                + timeOptions
                + ["-o", "-33,-32,-31,-30", "-m", "0", "-R", "ItemAssociationRecommender", '0,-11111', "-"]
            )
            self.analyzer.main(argv)
            textOutput = StringIO(sys.stdout.getvalue())
            self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames)

            # Redo through prepared file intermediary
            sys.stdout = StringIO()
            argv = (
                ["PreparePatientItems.py", "-c", "-7", "-Q", "86400", "-V", "86400"]
                + timeOptions
                + ["-o", "-33,-32,-31,-30", '0,-11111', "-"]
            )
            self.preparer.main(argv)
            preparedDataFile = StringIO(sys.stdout.getvalue())

            sys.stdin = preparedDataFile  # Read prepared data file from redirected stdin
            sys.stdout = StringIO()
            argv = (
                ["OutcomePredictionAnalysis.py", "-P", "-m", "0", "-R", "ItemAssociationRecommender"]
                + timeOptions
                + ['-', "-"]
            )
            self.analyzer.main(argv)
            textOutput = StringIO(sys.stdout.getvalue())
            self.assertEqualStatResultsTextOutput(expectedResults, textOutput, colNames)
    finally:
        # Never leave the process-wide streams redirected, even on assertion failure
        sys.stdout = originalStdout
        sys.stdin = originalStdin
def main(self, argv):
    """Main method, callable from command line.

    Parses ``argv`` (``argv[0]`` is the program name), builds an AnalysisQuery,
    runs the analysis by calling ``self(query)`` and writes the results as
    text to the named output file (or stdout).  When no positional argument
    is given, prints the help text and exits with status -1.

    The recommender is selected with the ``-R`` option; the only positional
    arguments are the patient IDs / data file and the optional output file.
    """
    # The usage line lists only the positional arguments the parser actually reads
    usageStr = (
        "usage: %prog [options] <patientIds/dataFile> [<outputFile>]\n"
        " <patientIds/dataFile> Name of file with patient ids. If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for. Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"
        " <outputFile> If query yields a result set, then that will be output\n"
        " to the named file. Leave blank or specify \"-\" to send to stdout.\n"
    )
    parser = OptionParser(usage=usageStr)
    parser.add_option(
        "-c", "--baseCategoryId", dest="baseCategoryId",
        help="ID of clinical item category to look for initial items from (probably the ADMIT Dx item).")
    parser.add_option(
        "-Q", "--queryTimeSpan", dest="queryTimeSpan",
        help="Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above. Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified).")
    parser.add_option(
        "-o", "--outcomeItemIds", dest="outcomeItemIds",
        help="Comma separated list of outcome item IDs to get prediction / recommendation scores for, as well as to label whether they actually appeared for the given patients. Can specify virtual items representing the end of item triples (e.g., 5-Readmission being the end of any item followed by 3591-Discharge then 3671-Admit), by adding the component items in expected sequence. For example, '5=3591:3671'")
    parser.add_option(
        "-t", "--timeDeltaMax", dest="timeDeltaMax",
        help="Time delta in seconds maximum by which recommendations should be based on. Defaults to recommending items that occur at ANY time after the key orders. If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items. If set, will also only count presence of labeled target items if occurs within the given time delta of the first query item.")
    parser.add_option(
        "-P", "--preparedPatientItemFile", dest="preparedPatientItemFile", action="store_true",
        help="If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database.")
    parser.add_option(
        "-R", "--recommender", dest="recommender",
        help="Name of the recommender to run the analysis against. Options: %s" % list(RECOMMENDER_CLASS_BY_NAME.keys()))
    parser.add_option(
        "-S", "--scoreField", dest="scoreField",
        help="Name of (derived) field to score items by. For example, 'conditionalFreq.'")
    parser.add_option(
        "-p", "--countPrefix", dest="countPrefix",
        help="Which counting method to use for item associations. Defaults to counting item occurrences, allowing for duplicates. Additional options include: %s." % list(COUNT_PREFIX_OPTIONS))
    parser.add_option(
        "-a", "--aggregationMethod", dest="aggregationMethod",
        help="Aggregation method to use for recommendations based off multiple query items. Options: %s." % list(AGGREGATOR_OPTIONS))
    parser.add_option(
        "-s", "--skipIfOutcomeInQuery", dest="skipIfOutcomeInQuery", action="store_true",
        help="If set, will skip patients where the outcome item occurs during the query period since that would defy the point of predicting the outcome.")
    parser.add_option(
        "-m", "--maxRecommendedId", dest="maxRecommendedId",
        help="Specify a maximum ID value to accept for recommended items. More used to limit output in test cases")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) > 0:
        # Parse out the query parameters
        query = AnalysisQuery()
        query.recommender = RECOMMENDER_CLASS_BY_NAME[options.recommender]()
        query.recommender.dataManager.dataCache = dict()  # Use local cache to speed up repeat queries

        query.baseRecQuery = RecommenderQuery()
        if options.preparedPatientItemFile:
            # Don't reconstruct validation data through database, just read off validation file
            query.preparedPatientItemFile = stdOpen(args[0])
        else:
            patientIdsParam = args[0]
            try:
                # Try to open patient IDs as a file
                patientIdFile = stdOpen(patientIdsParam)
                query.patientIds = set(patientIdFile.read().split())
            except IOError:
                # Unable to open as a filename, then interpret as simple comma-separated list
                query.patientIds = set(patientIdsParam.split(","))

            # Category to look for clinical item to start accruing query items from
            query.baseCategoryId = int(options.baseCategoryId)
            query.queryTimeSpan = timedelta(0, int(options.queryTimeSpan))

            query.baseRecQuery.targetItemIds = set()
            for outcomeIdStr in options.outcomeItemIds.split(","):
                # Each entry is either "<id>" or a virtual item "<id>=<seqId>:<seqId>..."
                outcomeIdComponents = outcomeIdStr.split("=")
                outcomeId = int(outcomeIdComponents[0])
                query.baseRecQuery.targetItemIds.add(outcomeId)
                if len(outcomeIdComponents) > 1:
                    sequenceIds = [int(seqIdStr) for seqIdStr in outcomeIdComponents[1].split(":")]
                    query.sequenceItemIdsByVirtualItemId[outcomeId] = tuple(sequenceIds)

        # The remaining options apply to both database and prepared-file input
        if options.timeDeltaMax is not None:
            query.baseRecQuery.timeDeltaMax = timedelta(0, int(options.timeDeltaMax))
        if options.scoreField is not None:
            query.baseRecQuery.sortField = options.scoreField
        if options.countPrefix is not None:
            query.baseRecQuery.countPrefix = options.countPrefix
        if options.aggregationMethod is not None:
            query.baseRecQuery.aggregationMethod = options.aggregationMethod
        if options.maxRecommendedId is not None:
            query.baseRecQuery.maxRecommendedId = int(options.maxRecommendedId)
        if options.skipIfOutcomeInQuery is not None:
            query.skipIfOutcomeInQuery = options.skipIfOutcomeInQuery

        # Run the actual analysis
        analysisResults = self(query)

        # Format the results for output
        outputFilename = None
        if len(args) > 1:
            outputFilename = args[1]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with analysis arguments to allow for deconstruction later
        print(COMMENT_TAG, json.dumps({"argv": argv}), file=outputFile)

        colNames = self.analysisHeaders(query)
        # Insert a mock record to get a header / label row
        analysisResults.insert(0, RowItemModel(colNames, colNames))

        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def main(self, argv):
    """Main method, callable from command line.

    Parses ``argv`` (``argv[0]`` is the program name), builds an AnalysisQuery,
    runs the analysis by calling ``self(query)`` and writes the results as
    text to the named output file (or stdout).  When no positional argument
    is given, prints the help text and exits with status -1.
    """
    usageStr = (
        "usage: %prog [options] <patientIds/dataFile> [<outputFile>]\n"
        " <patientIds/dataFile> Name of file with patient ids. If not found, then interpret as comma-separated list of test Patient IDs to prepare analysis data for. Alternatively, provide preparedPatientItemFile generated from PreparePatientItems as input.\n"
        " <outputFile> If query yields a result set, then that will be output\n"
        " to the named file. Leave blank or specify \"-\" to send to stdout.\n"
    )
    parser = OptionParser(usage=usageStr)
    parser.add_option(
        "-q", "--numQuery", dest="numQuery",
        help="Number of orders / items from each patient to use as query items to prime the recommendations. If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items")
    parser.add_option(
        "-v", "--numVerify", dest="numVerify",
        help="Number of orders / items from each patient after the query items to use to validate recommendations. If set to a float number in (0,1), then treat as a percentage of the patient's total orders / items. If left unset, then just use all remaining orders / items for that patient")
    parser.add_option(
        "-c", "--baseCategoryId", dest="baseCategoryId",
        help="Instead of specifying first nQ query items, specify ID of clinical item category to look for initial items from (probably the ADMIT Dx item).")
    parser.add_option(
        "-b", "--baseItemId", dest="baseItemId",
        help="Instead of specifying first nQ query items, specify ID of the specific clinical item to look for initial items from.")
    parser.add_option(
        "-S", "--startDate", dest="startDate",
        help="Only look for test data occuring on or after this start date.")
    parser.add_option(
        "-E", "--endDate", dest="endDate",
        help="Only look for test data occuring before this end date.")
    parser.add_option(
        "-Q", "--queryTimeSpan", dest="queryTimeSpan",
        help="Time frame specified in seconds over which to look for initial query items (e.g., 24hrs = 86400) after the base item found from the category above. Start the time counting from the first item time occuring after the category item above since the ADMIT Dx items are often keyed to dates only without times (defaulting to midnight of the date specified).")
    parser.add_option(
        "-V", "--verifyTimeSpan", dest="verifyTimeSpan",
        help="Time frame specified in seconds over which to look for verify items after initial query item time. Will ignore the query items that occur within the queryTimeSpan.")
    parser.add_option(
        "-P", "--preparedPatientItemFile", dest="preparedPatientItemFile", action="store_true",
        help="If set, will expect primary argument to instead be name of file to read input data from, instead of using above parameters to query from database.")
    parser.add_option(
        "-R", "--recommender", dest="recommender",
        help="Name of the recommender to run the analysis against. Options: %s" % list(RECOMMENDER_CLASS_BY_NAME.keys()))
    parser.add_option(
        "-r", "--numRecs", dest="numRecs",
        help="Number of orders / items to recommend for comparison against the verification set. Alternative set option numRecsByOrderSet to look for key order set usage and size.")
    parser.add_option(
        "-O", "--numRecsByOrderSet", dest="numRecsByOrderSet", action="store_true",
        help="If set, then look for an order_set_id column to find the key order set that triggered the evaluation time point to determine number of recommendations to consider.")
    parser.add_option(
        "-s", "--sortField", dest="sortField",
        help="Allow overriding of default sort field when returning ranked results")
    parser.add_option(
        "-f", "--fieldFilters", dest="fieldFilters",
        help="Filters to exclude results. Comma-separated separated list of field-op:value exclusions where op is either < or > like, conditionalFreq<:0.1,freqRatio<:1")
    parser.add_option(
        "-t", "--timeDeltaMax", dest="timeDeltaMax",
        help="If set, represents a time delta in seconds maximum by which recommendations should be based on. Defaults to recommending items that occur at ANY time after the key orders. If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items.")
    parser.add_option(
        "-a", "--aggregationMethod", dest="aggregationMethod",
        help="Aggregation method to use for recommendations based off multiple query items. Options: %s." % list(AGGREGATOR_OPTIONS))
    parser.add_option(
        "-p", "--countPrefix", dest="countPrefix",
        help="Prefix for how to do counts. Blank for default item counting allowing repeats, otherwise ignore repeats for patient_ or encounter_")
    parser.add_option(
        "-m", "--maxRecommendedId", dest="maxRecommendedId",
        help="Specify a maximum ID value to accept for recommended items. More used to limit output in test cases")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) >= 1:
        # Parse out the query parameters
        query = AnalysisQuery()
        query.recommender = RECOMMENDER_CLASS_BY_NAME[options.recommender]()
        query.recommender.dataManager.dataCache = dict()  # Use a dataCache to facilitate repeat queries

        if options.preparedPatientItemFile:
            # Don't reconstruct validation data through database, just read off validation file
            query.preparedPatientItemFile = stdOpen(args[0])
        else:
            patientIdsParam = args[0]
            try:
                # Try to open patient IDs as a file
                patientIdFile = stdOpen(patientIdsParam)
                query.patientIds = set(patientIdFile.read().split())
            except IOError:
                # Unable to open as a filename, then interpret as simple comma-separated list
                query.patientIds = set(patientIdsParam.split(","))

            if options.numQuery is not None:
                # NOTE(review): the help text mentions float values in (0,1), but only
                # integer counts are parsed here - confirm the intended behavior
                query.numQueryItems = int(options.numQuery)
                if options.numVerify is not None:
                    # -v is documented as optional; when it is omitted, keep whatever
                    # default the query object already carries instead of failing
                    # on int(None)
                    query.numVerifyItems = int(options.numVerify)
            else:
                # Alternative to specify query time span starting from a key category
                query.queryTimeSpan = timedelta(0, int(options.queryTimeSpan))
                query.verifyTimeSpan = timedelta(0, int(options.verifyTimeSpan))

            if options.baseCategoryId is not None:
                # Category to look for clinical item to start accruing query items from
                query.baseCategoryId = int(options.baseCategoryId)
            if options.baseItemId is not None:
                query.baseItemId = int(options.baseItemId)

            if options.startDate is not None:
                query.startDate = DBUtil.parseDateValue(options.startDate)
            if options.endDate is not None:
                query.endDate = DBUtil.parseDateValue(options.endDate)

        query.baseRecQuery = RecommenderQuery()
        query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds()
        query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds()
        if options.timeDeltaMax is not None and len(options.timeDeltaMax) > 0:
            query.baseRecQuery.timeDeltaMax = timedelta(0, int(options.timeDeltaMax))
        if options.aggregationMethod is not None:
            query.baseRecQuery.aggregationMethod = options.aggregationMethod
        if options.countPrefix is not None:
            query.baseRecQuery.countPrefix = options.countPrefix
        if options.maxRecommendedId is not None:
            query.baseRecQuery.maxRecommendedId = int(options.maxRecommendedId)
        if options.sortField is not None:
            query.baseRecQuery.sortField = options.sortField
        if options.fieldFilters is not None:
            # Each filter looks like "<field><op>:<value>", e.g. "conditionalFreq<:0.1"
            for fieldFilterStr in options.fieldFilters.split(","):
                (fieldOp, valueStr) = fieldFilterStr.split(":")
                query.baseRecQuery.fieldFilters[fieldOp] = float(valueStr)

        if options.numRecs is not None:
            query.numRecommendations = int(options.numRecs)
        else:
            # No recommendation count specified, then just use the same as the verify number
            query.numRecommendations = query.numVerifyItems
        query.numRecsByOrderSet = options.numRecsByOrderSet

        # Run the actual analysis
        analysisResults = self(query)

        # Format the results for output
        outputFilename = None
        if len(args) > 1:
            outputFilename = args[1]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with analysis arguments to allow for deconstruction later
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        formatter = TextResultsFormatter(outputFile)
        colNames = self.resultHeaders(query)
        formatter.formatTuple(colNames)  # Insert a mock record to get a header / label row
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def test_dataCache(self):
    """Verify that repeating queries with the cache turned on issues no extra DB queries.

    Query activity is measured through ``self.recommender.dataManager.queryCount``.
    """
    query = RecommenderQuery()
    query.countPrefix = "patient_"
    query.queryItemIds = set([-2, -5])
    query.limit = 3  # Just get top 3 ranks for simplicity
    query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

    # First query without cache
    self.recommender.dataManager.dataCache = None
    baselineData = self.recommender(query)
    baselineQueryCount = self.recommender.dataManager.queryCount

    # Redo query with cache
    self.recommender.dataManager.dataCache = dict()
    newData = self.recommender(query)
    newQueryCount = self.recommender.dataManager.queryCount
    self.assertEqualRecommendedData(baselineData, newData, query)  # Ensure getting same results
    self.assertNotEqual(baselineQueryCount, newQueryCount)  # Expect needed more queries since no prior cache
    baselineQueryCount = newQueryCount

    # Again, but should be no new query since have cached results last time
    newData = self.recommender(query)
    newQueryCount = self.recommender.dataManager.queryCount
    self.assertEqualRecommendedData(baselineData, newData, query)
    self.assertEqual(baselineQueryCount, newQueryCount)

    # Repeat multiple times, should still have no new query activity
    # (range rather than xrange so the test runs on Python 3 as well)
    for _ in range(10):
        newData = self.recommender(query)
        newQueryCount = self.recommender.dataManager.queryCount
        self.assertEqualRecommendedData(baselineData, newData, query)
        self.assertEqual(baselineQueryCount, newQueryCount)

    # Query for subset should still yield no new query
    query.queryItemIds = set([-2])
    newData = self.recommender(query)
    newQueryCount = self.recommender.dataManager.queryCount
    baselineData = newData  # New baseline for subset
    self.assertEqual(baselineQueryCount, newQueryCount)  # Expect no queries for subsets

    # Repeat query for subset
    newData = self.recommender(query)
    newQueryCount = self.recommender.dataManager.queryCount
    self.assertEqualRecommendedData(baselineData, newData, query)
    self.assertEqual(baselineQueryCount, newQueryCount)  # Expect no queries for subsets

    # Query for partial subset, partial new
    query.queryItemIds = set([-5, -6])
    newData = self.recommender(query)
    newQueryCount = self.recommender.dataManager.queryCount
    baselineData = newData  # New baseline for subset
    # Expect no new queries for subsets, because first query should have done mass-all query
    self.assertEqual(baselineQueryCount, newQueryCount)

    # Repeat for partial subset, no longer new.  Compare against the baseline
    # captured above; re-assigning the baseline here would make the
    # comparison trivially true.
    newData = self.recommender(query)
    newQueryCount = self.recommender.dataManager.queryCount
    self.assertEqualRecommendedData(baselineData, newData, query)
    self.assertEqual(baselineQueryCount, newQueryCount)
# Group the guideline item IDs by admission diagnosis, and keep track of each
# distinct (admitDxId, sectionName, guidelineName) combination seen.
admitDxIdSectionGuidelineNameTuples = set()
itemIdsByAdmitDxId = dict()
for admitDxId, sectionName, guidelineName, itemId, itemName, itemDescription, itemCount in resultsTable:
    itemIdsByAdmitDxId.setdefault(admitDxId, set()).add(itemId)
    admitDxIdSectionGuidelineNameTuples.add((admitDxId, sectionName, guidelineName))

# Extend each diagnosis' item set with the top recommender results for that diagnosis.
recommender = ItemAssociationRecommender()
# items() and sys.stderr.write behave the same on Python 2 and 3, unlike
# iteritems() and the "print >>" statement.  Only the set values are mutated
# inside the loop, never the dictionary itself.
for admitDxId, itemIds in itemIdsByAdmitDxId.items():
    sys.stderr.write("%s %s\n" % (admitDxId, len(itemIds)))  # Progress indicator
    recQuery = RecommenderQuery()
    recQuery.excludeItemIds = recommender.defaultExcludedClinicalItemIds()
    recQuery.excludeCategoryIds = recommender.defaultExcludedClinicalItemCategoryIds()
    recQuery.queryItemIds = [admitDxId]
    recQuery.timeDeltaMax = timedelta(1)  # Within one day
    recQuery.countPrefix = "patient_"
    recQuery.limit = TOP_ITEM_COUNT
    # Top results by P-value
    recQuery.sortField = "P-YatesChi2-NegLog"
    results = recommender(recQuery)
    for result in results:
        itemIds.add(result["clinical_item_id"])
def test_recommender(self):
    """Run the recommender against the mock test data and verify the expected ranked scores.

    Each scenario adjusts the shared query object, then compares the
    recommender output against a list of expected (clinical_item_id, score) rows.
    """
    headers = ["clinical_item_id", "score"]

    query = RecommenderQuery()
    query.sortField = "tf"
    query.limit = 16  # Go ahead and query for all since short list and can get expected calculation results for all
    query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

    def verify(idScorePairs):
        # Build the expected rows, run the current query and compare
        expectedData = [RowItemModel([itemId, score], headers) for (itemId, score) in idScorePairs]
        recommendedData = self.recommender(query)
        self.assertEqualRecommendedData(expectedData, recommendedData, query)

    # Baseline scores when no usable query item is given
    double = 2.0/13
    single = 1.0/13
    generalLikelihood = [
        (-2, double), (-5, double), (-6, double),
        (-1, single), (-3, single), (-7, single), (-8, single),
        (-10, single), (-11, single), (-12, single), (-13, single),
        (-14, single), (-15, single),
    ]

    log.debug(
        "Query with no item key input, just return ranks by general likelihood then."
    )
    verify(generalLikelihood)

    log.debug(
        "Query with key item inputs for which no data exists. Effecitvely ignore it then, so just return ranks by general likelihood."
    )
    query.queryItemIds = set([-100])
    verify(generalLikelihood)

    log.debug("Query with category filter on recommended results.")
    query.queryItemIds = set([-100])
    query.excludeCategoryIds = set([-1, -4, -5, -6])
    # Items -2, -1, -3, -14 and -15 are expected to drop out
    verify([
        (-5, double), (-6, double),
        (-7, single), (-8, single),
        (-10, single), (-11, single), (-12, single), (-13, single),
    ])

    log.debug(
        "Query with category filter and specific exclusion filter on recommended results."
    )
    query.queryItemIds = set([-100])
    query.excludeItemIds = set([-6, -10])
    query.excludeCategoryIds = set([-1, -4, -5, -6])
    # As above, with -6 and -10 additionally excluded
    verify([
        (-5, double),
        (-7, single), (-8, single),
        (-11, single), (-12, single), (-13, single),
    ])

    # Expected scores once items -2 and -5 are used as query items
    both = (1.0/6)*(2.0/2)+(1.0/4)*(1.0/2)
    strong = (1.0/6)*(2.0/2)
    quarter = (1.0/4)*(1.0/2)
    weak = (1.0/6)*(1.0/2)

    log.debug(
        "General query with a couple of input clinical items + one with no association data (should effectively be ignored)."
    )
    query.queryItemIds = set([-2, -5, -100])
    query.excludeItemIds = set()
    query.excludeCategoryIds = set()
    # The query items themselves (-5, -2) are not expected in the results
    verify([
        (-6, both),
        (-3, strong), (-7, strong), (-8, strong),
        (-14, quarter), (-15, quarter),
        (-1, weak), (-10, weak), (-11, weak), (-12, weak), (-13, weak),
    ])

    log.debug("General query with category limit")
    query.queryItemIds = set([-2, -5, -100])
    query.excludeItemIds = set()
    query.excludeCategoryIds = set([-2, -4, -5, -6])
    # Items -6, -7, -8, -14 and -15 are expected to drop out
    verify([
        (-3, strong),
        (-1, weak), (-10, weak), (-11, weak), (-12, weak), (-13, weak),
    ])

    log.debug("General query with specific exclusion")
    query.queryItemIds = set([-2, -5, -100])
    query.excludeItemIds = set([-4, -3, -2])
    query.excludeCategoryIds = set()
    # Item -3 is expected to drop out
    verify([
        (-6, both),
        (-7, strong), (-8, strong),
        (-14, quarter), (-15, quarter),
        (-1, weak), (-10, weak), (-11, weak), (-12, weak), (-13, weak),
    ])

    log.debug("General query, sort by TF*IDF lift.")
    query.queryItemIds = set([-2, -5, -100])
    query.excludeItemIds = set()
    query.excludeCategoryIds = set()
    query.sortField = "lift"
    liftStrong = (13.0/1)*((1.0/6)*(2.0/2))
    liftBoth = (13.0/2)*((1.0/6)*(2.0/2)+(1.0/4)*(1.0/2))
    liftQuarter = (13.0/1)*((1.0/4)*(1.0/2))
    liftWeak = (13.0/1)*((1.0/6)*(1.0/2))
    verify([
        (-3, liftStrong), (-7, liftStrong), (-8, liftStrong),
        (-6, liftBoth),
        (-14, liftQuarter), (-15, liftQuarter),
        (-1, liftWeak), (-10, liftWeak), (-11, liftWeak), (-12, liftWeak), (-13, liftWeak),
    ])
def test_recommender(self):
    """Check the top-ranked clinical item IDs returned for a series of queries.

    One RecommenderQuery instance is reused and mutated from scenario to
    scenario, so settings made in an earlier scenario (for example
    ``timeDeltaMax``) remain in force for the later ones unless reassigned.
    """
    recQuery = RecommenderQuery()
    # queryItemIds / excludeItemIds / categoryIds / timeDeltaMax are left at
    # their defaults for the first scenario.
    recQuery.limit = 3  # Just get top 3 ranks for simplicity
    recQuery.maxRecommendedId = 0  # Artificial constraint to focus only on test data

    def verifyRankedIds(expectedItemIds):
        # Build the expected rows, run the recommender on the current state
        # of recQuery, and compare the two result sets.
        columnNames = ["clinical_item_id"]
        expectedRows = [
            RowItemModel([itemId], columnNames) for itemId in expectedItemIds
        ]
        actualRows = self.recommender(recQuery)
        self.assertEqualRecommendedData(expectedRows, actualRows, recQuery)

    log.debug(
        "Query with no item key input, just return ranks by general likelihood then."
    )
    verifyRankedIds([-3, -6, -5])

    log.debug(
        "Query with key item inputs for which no data exists. Effecitvely ignore it then, so just return ranks by general likelihood."
    )
    recQuery.queryItemIds = {-100}
    verifyRankedIds([-3, -6, -5])

    log.debug("Query with category filter on recommended results.")
    recQuery.queryItemIds = {-100}
    recQuery.excludeCategoryIds = {-1, -4, -5, -6}
    verifyRankedIds([-6, -5])

    log.debug(
        "Query with category filter and specific exclusion filter on recommended results."
    )
    recQuery.queryItemIds = {-100}
    recQuery.excludeItemIds = {-6}
    recQuery.excludeCategoryIds = {-1, -4, -5, -6}
    verifyRankedIds([-5])

    log.debug(
        "General query with a couple of input clinical items + one with no association data (should effectively be ignored)."
    )
    recQuery.queryItemIds = {-2, -5, -100}
    recQuery.excludeItemIds = set()
    recQuery.excludeCategoryIds = set()
    verifyRankedIds([-4, -6])

    log.debug(
        "General query but set a limit on time delta worth counting item associations"
    )
    recQuery.queryItemIds = {-2, -5, -100}
    recQuery.excludeItemIds = set()
    recQuery.excludeCategoryIds = set()
    recQuery.timeDeltaMax = DELTA_HOUR
    verifyRankedIds([-6, -4])

    log.debug("General query with category limit")
    recQuery.queryItemIds = {-2, -5, -100}
    recQuery.excludeItemIds = set()
    recQuery.excludeCategoryIds = {-2, -4, -5, -6}
    recQuery.timeDeltaMax = DELTA_HOUR
    verifyRankedIds([-4])

    log.debug("General query with specific exclusion")
    recQuery.queryItemIds = {-2, -5, -100}
    recQuery.excludeItemIds = {-4, -3, -2}
    recQuery.excludeCategoryIds = set()
    recQuery.timeDeltaMax = DELTA_HOUR
    verifyRankedIds([-6])
def main(self, argv):
    """Main method, callable from command line.

    Parses ``argv`` (program name first), builds an AnalysisQuery around an
    OrderSetRecommender, runs the analysis via ``self(query)`` and writes
    the results with a TextResultsFormatter.

    Positional arguments:
        <inputFile>   required; opened with stdOpen and used as the
                      prepared patient item file.
        <outputFile>  optional; when omitted, stdOpen is called with None.

    If no positional argument is given, the option help is printed and the
    process exits with status -1.
    """
    usageStr = (
        "usage: %prog [options] <inputFile> [<outputFile>]\n"
        " <inputFile> Validation file in prepared result file format. Predict items and compare against verify sets similar to RecommendationClassficationAnalysis. \n"
        " <outputFile> Validation result stat summaries.\n"
    )
    parser = OptionParser(usage=usageStr)
    parser.add_option(
        "-X", "--excludeCategoryIds", dest="excludeCategoryIds",
        help="For recommendation, exclude / skip any items who fall under one of these comma-separated category Ids.")
    parser.add_option(
        "-s", "--sortField", dest="sortField", default=DEFAULT_SORT_FIELD,
        help="Score field to sort top recommendations by. Default to posterior probabilty / positive predictive value 'P(B|A)', but can also select 'lift' = 'tfidf' = 'interest' for TF*IDF style score weighting.")
    parser.add_option(
        "-r", "--numRecs", dest="numRecs", default=DEFAULT_RECOMMENDED_ITEM_COUNT,
        help="Number of orders / items to recommend for comparison against the verification set.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    if len(args) >= 1:
        query = AnalysisQuery()
        query.preparedPatientItemFile = stdOpen(args[0])
        query.recommender = OrderSetRecommender()
        query.baseRecQuery = RecommenderQuery()
        if options.excludeCategoryIds is not None:
            # Read the option through its declared dest name
            # ("excludeCategoryIds"); the misspelled "executeCategoryIds"
            # attribute does not exist and raised AttributeError whenever
            # -X was supplied.
            query.baseRecQuery.excludeCategoryIds = {
                int(categoryIdStr)
                for categoryIdStr in options.excludeCategoryIds.split(",")
            }
        else:
            # Default exclusions if none specified
            query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds()
            query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds()
        query.baseRecQuery.sortField = options.sortField
        # numRecs arrives as a string when given on the command line.
        query.numRecommendations = int(options.numRecs)

        # Run the actual analysis
        analysisResults = self(query)

        # Format the results for output
        outputFilename = None
        if len(args) > 1:
            outputFilename = args[1]
        outputFile = stdOpen(outputFilename, "w")

        # Print comment line with analysis arguments to allow for deconstruction later
        summaryData = {"argv": argv}
        print(COMMENT_TAG, json.dumps(summaryData), file=outputFile)

        formatter = TextResultsFormatter(outputFile)
        colNames = self.resultHeaders(query)
        formatter.formatTuple(colNames)  # Insert a mock record to get a header / label row
        formatter.formatResultDicts(analysisResults, colNames)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
def test_analysisPreparation(self):
    """Run the analysis preparer against the mock test data and verify the output.

    Scenarios are checked by calling ``self.analyzer`` directly with an
    AnalysisQuery and/or by invoking ``self.analyzer.main`` with an
    equivalent argv and parsing what it printed to stdout.

    stdout is redirected only for the duration of each command-line run and
    is restored afterwards (even if ``main`` raises), so this test does not
    leave ``sys.stdout`` pointing at an in-memory buffer.
    """

    def runCommandLine(argv):
        # Invoke the command-line entry point with stdout captured and
        # return the captured text wrapped in a fresh readable StringIO.
        originalStdout = sys.stdout
        captured = StringIO()
        sys.stdout = captured  # Redirect stdout output to collect test results
        try:
            self.analyzer.main(argv)
            return StringIO(captured.getvalue())
        finally:
            sys.stdout = originalStdout

    analysisQuery = AnalysisQuery()
    analysisQuery.patientIds = set([-11111])
    analysisQuery.baseCategoryId = -7
    analysisQuery.queryTimeSpan = timedelta(0, 86400)
    analysisQuery.verifyTimeSpan = timedelta(0, 604800)
    analysisQuery.baseRecQuery = RecommenderQuery()
    analysisQuery.baseRecQuery.targetItemIds = set([-33, -32, -31, -30])
    analysisQuery.baseRecQuery.excludeItemIds = [-13]
    analysisQuery.baseRecQuery.excludeCategoryIds = [-5]
    analysisQuery.baseRecQuery.maxRecommendedId = 0  # Restrict to test data

    # Initial run without time limits on outcome measure
    colNames = [
        "patient_id", "baseItemId", "queryItemCountById", "verifyItemCountById",
        "outcome.-33", "outcome.-32", "outcome.-31", "outcome.-30"
    ]
    expectedResults = [
        RowItemModel(
            [-11111, -21, {-4: 2, -10: 1, -8: 1, -32: 1}, {-30: 1}, +0, +2, +1, +1],
            colNames)
    ]
    analysisResults = list(self.analyzer(analysisQuery))
    self.assertEqualResultDicts(expectedResults, analysisResults, colNames)

    # Redo but run through command-line interface
    argv = [
        "PreparePatientItems.py", "-c", "-7", "-Q", "86400", "-V", "604800",
        "-o", "-33,-32,-31,-30", '0,-11111', "-"
    ]
    textOutput = runCommandLine(argv)
    self.assertEqualTextOutput(expectedResults, textOutput, colNames)

    # Now try with time limitation on outcome measure
    analysisQuery.baseRecQuery.timeDeltaMax = timedelta(0, 604800)  # 1 week
    colNames = [
        "patient_id", "queryItemCountById", "verifyItemCountById",
        "outcome.-33", "outcome.-32", "outcome.-30"
    ]
    expectedResults = [
        RowItemModel(
            [-11111, {-4: 2, -10: 1, -8: 1, -32: 1}, {-30: 1}, +0, +2, +1],
            colNames)
    ]
    analysisResults = list(self.analyzer(analysisQuery))
    self.assertEqualResultDicts(expectedResults, analysisResults, colNames)

    # Redo but run through command-line interface
    argv = [
        "PreparePatientItems.py", "-c", "-7", "-Q", "86400", "-V", "604800",
        "-t", "604800", "-o", "-33,-32,-31,-30", '0,-11111', "-"
    ]
    textOutput = runCommandLine(argv)
    self.assertEqualTextOutput(expectedResults, textOutput, colNames)

    # Now include all historical (demographic) data items
    analysisQuery.pastCategoryIds = [-8]
    colNames = [
        "patient_id", "queryItemCountById", "verifyItemCountById",
        "outcome.-33", "outcome.-32", "outcome.-30"
    ]
    expectedResults = [
        RowItemModel(
            [-11111,
             {-43: 1, -45: 1, -41: 1, -46: 1, -4: 2, -10: 1, -8: 1, -32: 1},
             {-30: 1}, +0, +2, +1],
            colNames)
    ]
    analysisResults = list(self.analyzer(analysisQuery))
    self.assertEqualResultDicts(expectedResults, analysisResults, colNames)

    # Redo but run through command-line interface
    argv = [
        "PreparePatientItems.py", "-p", "-8", "-c", "-7", "-Q", "86400",
        "-V", "604800", "-t", "604800", "-o", "-33,-32,-31,-30", '0,-11111', "-"
    ]
    textOutput = runCommandLine(argv)
    self.assertEqualTextOutput(expectedResults, textOutput, colNames)

    # Different search where items are recorded with date level instead of time level precision
    # Note the verify time threshold of (1 day + 1 second), so the next day of data is captured instead of just missed
    colNames = [
        "patient_id", "baseItemId", "queryItemCountById", "verifyItemCountById",
        "outcome.-33", "outcome.-32", "outcome.-31", "outcome.-30"
    ]
    expectedResults = [
        RowItemModel(
            [-44444, -21, {-6: 1, -12: 1}, {-11: 1, -8: 1}, +0, +0, +0, +0],
            colNames)
    ]
    # Run through command-line interface
    argv = [
        "PreparePatientItems.py", "-c", "-7", "-Q", "14400", "-V", "86401",
        "-o", "-33,-32,-31,-30", '0,-44444', "-"
    ]
    textOutput = runCommandLine(argv)
    self.assertEqualTextOutput(expectedResults, textOutput, colNames)

    # Include background demographics category
    colNames = [
        "patient_id", "baseItemId", "queryItemCountById", "verifyItemCountById",
        "outcome.-33", "outcome.-32", "outcome.-31", "outcome.-30"
    ]
    expectedResults = [
        RowItemModel(
            [-44444, -21,
             {-43: 1, -45: 1, -42: 1, -46: 1, -6: 1, -12: 1},
             {-11: 1, -8: 1}, +0, +0, +0, +0],
            colNames)
    ]
    # Run through command-line interface
    argv = [
        "PreparePatientItems.py", "-p", "-8", "-c", "-7", "-Q", "14400",
        "-V", "86401", "-o", "-33,-32,-31,-30", '0,-44444', "-"
    ]
    textOutput = runCommandLine(argv)
    self.assertEqualTextOutput(expectedResults, textOutput, colNames)
# Batch script: for every diagnosis row in the input CSV, build a
# RecommenderQuery from a URL-style parameter string, run the
# ItemAssociationRecommender, and create a per-diagnosis CSV with a header row.
# Both the input file and each output file are managed by `with`, so handles
# are closed (and the header flushed) deterministically.
# NOTE(review): mode "rU" is only accepted by Python 2 and Python 3 < 3.11;
# use "r" if this script is ported forward.
with open('/Users/jwang/Desktop/Results/diagnoses_to_test.csv', "rU") as diagnoses:
    diagnoses.readline()  # Discard the first line (presumably a header row)
    baseQueryStr = "&targetItemIds=&excludeItemIds=71052,71046,71054,71083,71045,71047&excludeCategoryIds=1,58,4,2,160,161,59,13,159,163,23,62,18,11,46,2&timeDeltaMax=86400&sortField=P-YatesChi2-NegLog&sortReverse=True&filterField1=prevalence<:&filterField2=PPV<:&filterField3=RR<:&filterField4=sensitivity<:&filterField5=P-YatesChi2<:&resultCount=4000&invertQuery=false&showCounts=true&countPrefix=patient_&aggregationMethod=weighted&cacheTime=0"
    recommender = ItemAssociationRecommender()
    diagnosis_count = 0
    for line in diagnoses:
        line = line.strip()
        if not line:
            # A blank line would otherwise produce a query with an empty
            # item ID and an output file named " .csv".
            continue
        fields = line.split(",")
        clinical_item_id = fields[0]
        # Any further comma-separated pieces are rejoined with spaces.
        description = " ".join(fields[1:])
        queryStr = "queryItemIds=" + str(clinical_item_id) + baseQueryStr
        print('Finding Top Associations for "{0}"'.format(description))

        # Build RecommenderQuery
        query = RecommenderQuery()
        # Second argument True = keep parameters whose value is blank.
        paramDict = dict(urlparse.parse_qsl(queryStr, True))
        query.parseParams(paramDict)

        # Call ItemRecommender
        # NOTE(review): the code visible here never writes `recommendations`
        # to the output file; only the header row is emitted.
        recommendations = recommender(query)

        # Output to csv file
        description = description.replace("/", ";")  # "/" would act as a path separator in the filename
        fname = str(clinical_item_id) + " " + str(description) + ".csv"
        with open(
                "/Users/jwang/Desktop/Results/item_associations_expert_unmatched/"
                + fname, "w") as outfname:
            outfname.write(
                "clinical_item_id,description,score,PPV,OR,prevalence,RR,P-YatesChi2\n"
            )
def main(self, argv):
    """Main method, callable from command line.

    Parses ``argv`` (program name first), builds an AnalysisQuery for the
    recommender class named by ``-R``, runs the analysis via ``self(query)``
    and writes the result rows through a TextResultsFormatter.

    Positional arguments:
        <patientIds>  required; tried first as a file name (whitespace-
                      separated IDs), and on IOError interpreted as a
                      comma-separated list of IDs.
        <outputFile>  optional; when omitted, stdOpen is called with None
                      (the usage text documents this as stdout).

    If no positional argument is given, the option help is printed and the
    process exits with status -1.
    """
    usageStr = (
        "usage: %prog [options] <patientIds> [<outputFile>]\n"
        " <patientIds> Patient ID file or Comma-separated list of test Patient IDs to run analysis against\n"
        " <outputFile> If query yields a result set, then that will be output\n"
        " to the named file. Leave blank or specify \"-\" to send to stdout.\n"
    )
    parser = OptionParser(usage=usageStr)
    # list(...) keeps the help text a plain list on Python 3 as well.
    parser.add_option(
        "-R", "--recommender", dest="recommender",
        help="Name of the recommender to run the analysis against. Options: %s" % list(RECOMMENDER_CLASS_BY_NAME.keys()))
    parser.add_option(
        "-s", "--sortField", dest="sortField",
        help="Allow overriding of default sort field when returning ranked results")
    parser.add_option(
        "-f", "--fieldFilters", dest="fieldFilters",
        help="Filters to exclude results. Comma-separated separated list of field-op:value exclusions where op is either < or > like, conditionalFreq<:0.1,frqeRatio<:1")
    parser.add_option(
        "-t", "--timeDeltaMax", dest="timeDeltaMax",
        help="If set, represents a time delta in seconds maximum by which recommendations should be based on. Defaults to recommending items that occur at ANY time after the key orders. If provided, will apply limits to only orders placed within 0 seconds, 1 hour (3600), 1 day (86400), or 1 week (604800) of the key orders / items.")
    parser.add_option(
        "-a", "--aggregationMethod", dest="aggregationMethod",
        help="Aggregation method to use for recommendations based off multiple query items. Options: %s." % list(AGGREGATOR_OPTIONS))
    parser.add_option(
        "-p", "--countPrefix", dest="countPrefix",
        help="Prefix for how to do counts. Blank for default item counting allowing repeats, otherwise ignore repeats for patient_ or encounter_")
    parser.add_option(
        "-q", "--queryItemMax", dest="queryItemMax",
        help="If set, specifies a maximum number of query items to use when analyzing serial recommendations. Will stop analyzing further for a patient once reach this limit.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()
    # Only <patientIds> is required; <outputFile> is optional per the usage
    # text, so a single positional argument must be accepted.
    if len(args) > 0:
        # Parse out the query parameters
        query = AnalysisQuery()
        # NOTE(review): the class is looked up by the -R value with no
        # fallback, so -R is effectively required - confirm.
        query.recommender = RECOMMENDER_CLASS_BY_NAME[options.recommender]()
        query.recommender.dataManager.dataCache = dict()  # Use a local cache to speed up repeat queries

        patientIdsParam = args[0]
        try:
            # Try to open patient IDs as a file
            patientIdFile = stdOpen(patientIdsParam)
            query.patientIds = set(patientIdFile.read().split())
        except IOError:
            # Unable to open as a filename, then interpret as simple comma-separated list
            query.patientIds = set(patientIdsParam.split(","))

        query.baseRecQuery = RecommenderQuery()
        query.baseRecQuery.excludeCategoryIds = query.recommender.defaultExcludedClinicalItemCategoryIds()
        query.baseRecQuery.excludeItemIds = query.recommender.defaultExcludedClinicalItemIds()
        if options.sortField is not None:
            query.baseRecQuery.sortField = options.sortField
        if options.fieldFilters is not None:
            # Each filter looks like "<field><op>:<value>", e.g. "conditionalFreq<:0.1"
            for fieldFilterStr in options.fieldFilters.split(","):
                (fieldOp, valueStr) = fieldFilterStr.split(":")
                query.baseRecQuery.fieldFilters[fieldOp] = float(valueStr)
        if options.timeDeltaMax is not None and len(options.timeDeltaMax) > 0:
            query.baseRecQuery.timeDeltaMax = timedelta(0, int(options.timeDeltaMax))
        if options.aggregationMethod is not None:
            query.baseRecQuery.aggregationMethod = options.aggregationMethod
        if options.countPrefix is not None:
            query.baseRecQuery.countPrefix = options.countPrefix

        if options.queryItemMax is not None:
            query.queryItemMax = int(options.queryItemMax)

        # Run the actual analysis
        analysisResults = self(query)

        # Format the results for output
        outputFilename = None
        if len(args) > 1:
            outputFilename = args[1]
        outputFile = stdOpen(outputFilename, "w")

        # Comment line with analysis arguments to allow for deconstruction later.
        # Written with write() so the same bytes ("# " + str(argv) + newline)
        # are produced on both Python 2 and Python 3.
        outputFile.write("# %s\n" % (argv,))

        colNames = ["patientId", "clinicalItemId", "iItem", "iRecItem", "recRank", "recScore"]
        analysisResults.insert(0, colNames)  # Insert a mock record to get a header / label row

        formatter = TextResultsFormatter(outputFile)
        formatter.formatResultSet(analysisResults)
    else:
        parser.print_help()
        sys.exit(-1)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)