Пример #1
0
    def test_recommender_aggregation(self):
        # Exercise each score aggregation method against one fixed query and
        # compare the (clinical_item_id, conditionalFreq, freqRatio) rows returned.
        columns = ["clinical_item_id", "conditionalFreq", "freqRatio"]

        query = RecommenderQuery()
        query.countPrefix = "patient_"
        query.queryItemIds = {-2, -5}
        # query.timeDeltaMax is left unset here.  If set to one of the constants
        # (DELTA_ZERO, DELTA_HOUR, etc.), item associations that occurred within
        # that time delta count as co-occurrent; if left blank, all items within
        # a given patient are considered co-occurrent.
        query.limit = 3  # Top 3 ranks are enough for this check
        query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

        def verify(rows):
            # Build the expectation first, then run the recommender on the
            # current state of `query`, then compare the two.
            expected = [RowItemModel(list(row), columns) for row in rows]
            actual = self.recommender(query)
            self.assertEqualRecommendedData(expected, actual, query)

        # Item -6 is expected with identical values in every case below.
        item6 = (-6, 0.16667, 7.142857)

        # Default weighted aggregation method (aggregationMethod not set yet)
        verify([(-4, 0.3, 22.5), item6])

        # Unweighted aggregation method
        query.aggregationMethod = "unweighted"
        verify([(-4, 0.32857, 24.64286), item6])

        # Serial Bayes aggregation method
        query.aggregationMethod = "SerialBayes"
        verify([(-4, 0.89157, 66.867471), item6])

        # Naive Bayes aggregation.  The expected values are those obtained
        # without truncating negative values; with truncation the first row
        # would instead be (-4, 0.8, 58.59707).
        query.aggregationMethod = "NaiveBayes"
        verify([(-4, 3.75, 281.25), item6])

        # Apply value filter on top of the Naive Bayes settings.
        # NOTE(review): the only row expected to remain has freqRatio 7.142857,
        # which is below the 10.0 given here -- confirm the intended semantics
        # of the "freqRatio>" filter key against the recommender implementation.
        query.fieldFilters["freqRatio>"] = 10.0
        verify([item6])
Пример #2
0
    def test_recommender(self):
        # Run the recommender against the mock test data and verify the
        # (clinical_item_id, score) rows it returns.  One query object is
        # mutated step by step and re-submitted after every change.
        columns = ["clinical_item_id", "score"]

        query = RecommenderQuery()
        # query.timeDeltaMax is left unset.  If set to one of the constants
        # (DELTA_ZERO, DELTA_HOUR, etc.), item associations that occurred within
        # that time delta count as co-occurrent; if left blank, all items within
        # a given patient are considered co-occurrent.
        query.sortField = "tf"
        query.limit = 16  # Short list, so query for all and check every result
        query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

        def verify(idScorePairs):
            # Build the expectation first, then run the recommender on the
            # current state of `query`, then compare the two.
            expected = [
                RowItemModel([itemId, score], columns)
                for (itemId, score) in idScorePairs
            ]
            actual = self.recommender(query)
            self.assertEqualRecommendedData(expected, actual, query)

        def without(idScorePairs, droppedIds):
            # Same pairs in the same order, minus the listed item ids.
            return [pair for pair in idScorePairs if pair[0] not in droppedIds]

        # Expected ranking when no usable query item is supplied
        twoOf13 = 2.0 / 13
        oneOf13 = 1.0 / 13
        baseline = [(-2, twoOf13), (-5, twoOf13), (-6, twoOf13)]
        baseline += [
            (itemId, oneOf13)
            for itemId in (-1, -3, -7, -8, -10, -11, -12, -13, -14, -15)
        ]

        log.debug(
            "Query with no item key input, just return ranks by general likelihood then."
        )
        verify(baseline)

        log.debug(
            "Query with key item inputs for which no data exists.  Effecitvely ignore it then, so just return ranks by general likelihood."
        )
        query.queryItemIds = {-100}
        verify(baseline)

        log.debug("Query with category filter on recommended results.")
        query.queryItemIds = {-100}
        query.excludeCategoryIds = {-1, -4, -5, -6}
        # Items -2, -1, -3, -14 and -15 are expected to drop out of the baseline
        categoryFiltered = without(baseline, (-2, -1, -3, -14, -15))
        verify(categoryFiltered)

        log.debug(
            "Query with category filter and specific exclusion filter on recommended results."
        )
        query.queryItemIds = {-100}
        query.excludeItemIds = {-6, -10}
        query.excludeCategoryIds = {-1, -4, -5, -6}
        # Same as the previous case, with the two excluded items removed as well
        verify(without(categoryFiltered, (-6, -10)))

        # Expected "tf" scores once items -2 and -5 are used as query items,
        # named by magnitude.  The query items themselves are not part of the
        # expected results.
        tfHigh = (1.0 / 6) * (2.0 / 2)
        tfMid = (1.0 / 4) * (1.0 / 2)
        tfLow = (1.0 / 6) * (1.0 / 2)
        tfTop = tfHigh + tfMid
        associated = [(-6, tfTop)]
        associated += [(itemId, tfHigh) for itemId in (-3, -7, -8)]
        associated += [(itemId, tfMid) for itemId in (-14, -15)]
        associated += [(itemId, tfLow) for itemId in (-1, -10, -11, -12, -13)]

        log.debug(
            "General query with a couple of input clinical items + one with no association data (should effectively be ignored)."
        )
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = set()
        query.excludeCategoryIds = set()
        verify(associated)

        log.debug("General query with category limit")
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = set()
        query.excludeCategoryIds = {-2, -4, -5, -6}
        # Items -6, -7, -8, -14 and -15 are expected to drop out
        verify(without(associated, (-6, -7, -8, -14, -15)))

        log.debug("General query with specific exclusion")
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = {-4, -3, -2}
        query.excludeCategoryIds = set()
        # Of the excluded ids only -3 appears in the unfiltered expectation
        verify(without(associated, (-3,)))

        log.debug("General query, sort by TF*IDF lift.")
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = set()
        query.excludeCategoryIds = set()
        query.sortField = "lift"
        # Expected lift is the tf score scaled by 13.0/2 for item -6 and by
        # 13.0/1 for every other item, which moves -6 below -3, -7 and -8.
        lifted = [(itemId, (13.0 / 1) * tfHigh) for itemId in (-3, -7, -8)]
        lifted += [(-6, (13.0 / 2) * tfTop)]
        lifted += [(itemId, (13.0 / 1) * tfMid) for itemId in (-14, -15)]
        lifted += [
            (itemId, (13.0 / 1) * tfLow) for itemId in (-1, -10, -11, -12, -13)
        ]
        verify(lifted)
Пример #3
0
    def test_dataCache(self):
        # Test that repeating queries with cache turned on will not result in extra DB queries.
        # Query activity is observed through self.recommender.dataManager.queryCount,
        # which is sampled after every recommender call.
        query = RecommenderQuery()
        query.countPrefix = "patient_"
        query.queryItemIds = {-2, -5}
        # query.timeDeltaMax is left unset.  If set to one of the constants
        # (DELTA_ZERO, DELTA_HOUR, etc.), item associations that occurred within
        # that time delta count as co-occurrent; if left blank, all items within
        # a given patient are considered co-occurrent.
        query.limit = 3  # Just get top 3 ranks for simplicity
        query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

        # First query without cache
        self.recommender.dataManager.dataCache = None
        baselineData = self.recommender(query)
        baselineQueryCount = self.recommender.dataManager.queryCount

        # Redo query with cache
        self.recommender.dataManager.dataCache = dict()
        newData = self.recommender(query)
        newQueryCount = self.recommender.dataManager.queryCount
        # Ensure getting same results
        self.assertEqualRecommendedData(baselineData, newData, query)
        # Expect needed more queries since no prior cache
        self.assertNotEqual(baselineQueryCount, newQueryCount)
        baselineQueryCount = newQueryCount

        # Again, but should be no new query since have cached results last time
        newData = self.recommender(query)
        newQueryCount = self.recommender.dataManager.queryCount
        self.assertEqualRecommendedData(baselineData, newData, query)
        self.assertEqual(baselineQueryCount, newQueryCount)

        # Repeat multiple times, should still have no new query activity.
        # range (not xrange) so the loop runs on both Python 2 and Python 3.
        for _ in range(10):
            newData = self.recommender(query)
            newQueryCount = self.recommender.dataManager.queryCount
            self.assertEqualRecommendedData(baselineData, newData, query)
            self.assertEqual(baselineQueryCount, newQueryCount)

        # Query for subset should still yield no new query
        query.queryItemIds = {-2}
        newData = self.recommender(query)
        newQueryCount = self.recommender.dataManager.queryCount
        baselineData = newData  # New baseline for subset
        # Expect no queries for subsets
        self.assertEqual(baselineQueryCount, newQueryCount)

        # Repeat query for subset
        newData = self.recommender(query)
        newQueryCount = self.recommender.dataManager.queryCount
        self.assertEqualRecommendedData(baselineData, newData, query)
        # Expect no queries for subsets
        self.assertEqual(baselineQueryCount, newQueryCount)

        # Query for partial subset, partial new
        query.queryItemIds = {-5, -6}
        newData = self.recommender(query)
        newQueryCount = self.recommender.dataManager.queryCount
        baselineData = newData  # New baseline for partial subset
        # Expect no new queries here either, because first query should have done mass-all query
        self.assertEqual(baselineQueryCount, newQueryCount)

        # Repeat for partial subset, no longer new.  The baseline from the
        # previous run is deliberately kept (not reassigned) so the comparison
        # below checks this run against the previous one rather than against itself.
        newData = self.recommender(query)
        newQueryCount = self.recommender.dataManager.queryCount
        self.assertEqualRecommendedData(baselineData, newData, query)
        self.assertEqual(baselineQueryCount, newQueryCount)
Пример #4
0
    def test_recommender(self):
        # Run the recommender against the mock test data and verify which
        # clinical_item_id values come back, and in which order.  One query
        # object is mutated step by step and re-submitted after every change.
        columns = ["clinical_item_id"]

        query = RecommenderQuery()
        query.limit = 3  # Just get top 3 ranks for simplicity
        query.maxRecommendedId = 0  # Artificial constraint to focus only on test data

        def verify(itemIds):
            # Build the expectation first, then run the recommender on the
            # current state of `query`, then compare the two.
            expected = [RowItemModel([itemId], columns) for itemId in itemIds]
            actual = self.recommender(query)
            self.assertEqualRecommendedData(expected, actual, query)

        log.debug(
            "Query with no item key input, just return ranks by general likelihood then."
        )
        verify((-3, -6, -5))

        log.debug(
            "Query with key item inputs for which no data exists.  Effecitvely ignore it then, so just return ranks by general likelihood."
        )
        query.queryItemIds = {-100}
        verify((-3, -6, -5))

        log.debug("Query with category filter on recommended results.")
        query.queryItemIds = {-100}
        query.excludeCategoryIds = {-1, -4, -5, -6}
        verify((-6, -5))

        log.debug(
            "Query with category filter and specific exclusion filter on recommended results."
        )
        query.queryItemIds = {-100}
        query.excludeItemIds = {-6}
        query.excludeCategoryIds = {-1, -4, -5, -6}
        verify((-5,))

        log.debug(
            "General query with a couple of input clinical items + one with no association data (should effectively be ignored)."
        )
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = set()
        query.excludeCategoryIds = set()
        verify((-4, -6))

        log.debug(
            "General query but set a limit on time delta worth counting item associations"
        )
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = set()
        query.excludeCategoryIds = set()
        # When timeDeltaMax is one of the constants (DELTA_ZERO, DELTA_HOUR,
        # etc.), item associations that occurred within that time delta count
        # as co-occurrent; left blank, all items within a given patient are
        # considered co-occurrent.  The expected order of -4 and -6 flips here.
        query.timeDeltaMax = DELTA_HOUR
        verify((-6, -4))

        log.debug("General query with category limit")
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = set()
        query.excludeCategoryIds = {-2, -4, -5, -6}
        query.timeDeltaMax = DELTA_HOUR
        verify((-4,))

        log.debug("General query with specific exclusion")
        query.queryItemIds = {-2, -5, -100}
        query.excludeItemIds = {-4, -3, -2}
        query.excludeCategoryIds = set()
        query.timeDeltaMax = DELTA_HOUR
        verify((-6,))