Пример #1
0
def main_buildTopicModel(argv):
    """Build one topic model for each entry of numTopicsOptions from the configured bag-of-words file.

    argv -- Not read by this function.

    The input file is SOURCE_DATA_DIR + INPUT_FILENAME.  For every topic count, TopicModel.main
    is invoked with an output filename of the form
        MODEL_DIR/topicModel.<input base name>.<numTopics>Topic.model

    Returns the model attribute of the shared TopicModel instance, as it stands after the last run.
    """
    inputFilename = SOURCE_DATA_DIR + INPUT_FILENAME

    modeler = TopicModel()
    for topicCount in numTopicsOptions:
        # Note % binds tighter than +, so only the trailing literal receives the topic count
        outputFilename = MODEL_DIR + "/topicModel." + os.path.basename(
            inputFilename) + ".%dTopic.model" % (topicCount)
        modeler.main(
            ["TopicModel", "-n",
             str(topicCount), inputFilename, outputFilename])
    return modeler.model
Пример #2
0
    def __init__(self, model, docCountByWordId=None):
        """Initialize module with prior generated model and word document counts from TopicModel module.

        model -- Either the model object itself (when docCountByWordId is also supplied),
            or a base filename that TopicModel.loadModelAndDocCounts loads both objects from.
        docCountByWordId -- Word document counts to pair with the model object.
            Leave as None to have model interpreted as a base filename.
        """
        BaseItemRecommender.__init__(self)
        # Utility instance to run off of
        self.modeler = TopicModel()

        # Test against None instead of truthiness, so that an explicitly supplied but empty
        #   count dictionary is not mistaken for the "load from file" usage
        if docCountByWordId is not None:  # Specified both options
            self.model = model
            self.docCountByWordId = docCountByWordId
        else:  # If only the first one specified, interpret it as a base filename to load the objects from
            filename = model
            (self.model, self.docCountByWordId
             ) = self.modeler.loadModelAndDocCounts(filename)

        # Cached lookup data.  Don't repeat work for serial queries
        self.itemsById = None
        self.categoryIdByItemId = None
        self.candidateItemIds = None
        self.weightByItemIdByTopicId = None
Пример #3
0
def main_quickTest(argv):
    """Time how long it takes to load a saved model and to generate its item weights,
    then run three full passes over the generated weights with a progress tracker.

    argv -- argv[1] is the model filename handed to TopicModel.loadModelAndDocCounts
    """
    modelFilename = argv[1]
    modeler = TopicModel()

    loadStart = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    loadSeconds = time.time() - loadStart
    log.info("%.2f seconds to load", loadSeconds)

    generateStart = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(
        model, 100)
    generateSeconds = time.time() - generateStart
    log.info("%.2f seconds to generate weights", generateSeconds)

    # Walk every (topic, item) weight entry, three times over
    passesRemaining = 3
    while passesRemaining > 0:
        progress = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                progress.update()
        progress.printStatus()
        passesRemaining -= 1
    """
Пример #4
0
class TopicModelRecommender(BaseItemRecommender):
    """Implementation class for item (e.g., order) recommendation based on topic models 
    (LDA Latent Dirichlet Allocation or HDP Hierarchical Dirichlet Process).
    """
    def __init__(self, model, docCountByWordId=None):
        """Initialize module with prior generated model and word document counts from TopicModel module.

        model -- Either the model object itself (when docCountByWordId is also supplied),
            or a base filename that TopicModel.loadModelAndDocCounts loads both objects from.
        docCountByWordId -- Word document counts to pair with the model object.
            Leave as None to have model interpreted as a base filename.
        """
        BaseItemRecommender.__init__(self)
        # Utility instance to run off of
        self.modeler = TopicModel()

        # Test against None instead of truthiness, so that an explicitly supplied but empty
        #   count dictionary is not mistaken for the "load from file" usage
        if docCountByWordId is not None:  # Specified both options
            self.model = model
            self.docCountByWordId = docCountByWordId
        else:  # If only the first one specified, interpret it as a base filename to load the objects from
            filename = model
            (self.model, self.docCountByWordId
             ) = self.modeler.loadModelAndDocCounts(filename)

        # Cached lookup data.  Don't repeat work for serial queries
        self.itemsById = None
        self.categoryIdByItemId = None
        self.candidateItemIds = None
        self.weightByItemIdByTopicId = None

    def initItemLookups(self, query):
        """Load the clinical_item table and derive the cached lookups used by serial queries.

        Populates:
            self.itemsById -- Result of DBUtil.loadTableAsDict("clinical_item")
            self.categoryIdByItemId -- clinical_item_category_id of each loaded item, keyed by item ID
            self.candidateItemIds -- Keys of self.docCountByWordId that isItemRecommendable accepts
                for this query, when checked against an empty set of query items
        """
        self.itemsById = DBUtil.loadTableAsDict("clinical_item")

        categoryIdByItemId = self.categoryIdByItemId = dict()
        for (itemId, item) in self.itemsById.iteritems():
            categoryIdByItemId[itemId] = item["clinical_item_category_id"]

        candidateItemIds = self.candidateItemIds = set()
        noQueryItems = set()
        for itemId in self.docCountByWordId.keys():
            if not self.isItemRecommendable(itemId, noQueryItems, query,
                                            self.categoryIdByItemId):
                continue
            candidateItemIds.add(itemId)

    def __call__(self, query):
        """Given query items, use the model to find related topics, then score recommendable items by those topics.

        query -- Object read here for queryItemIds (dictionary of counts by item ID, or a plain
            collection of item IDs that then each count once), itemsPerCluster, excludeCategoryIds,
            minClusterWeight and sortField.

        Returns a list of dictionaries, one per recommendable candidate item, sorted on the "score"
        entry (a copy of the entry named by query.sortField) using RowItemFieldComparator with reverse=True.

        Item lookups and per-topic item weights are only generated on the first call and reused
        afterwards, so they are based on the first query received.
        """
        # One-time setup, cached to save time on serial queries
        if self.itemsById is None:
            self.initItemLookups(query)
        if self.weightByItemIdByTopicId is None:
            self.weightByItemIdByTopicId = self.modeler.generateWeightByItemIdByTopicId(
                self.model, query.itemsPerCluster)

        # Adapt query into bag-of-words format.
        # Anything other than a dictionary is taken as a plain collection of item IDs, each counted once
        queryItemCountById = query.queryItemIds
        if not isinstance(queryItemCountById, dict):
            queryItemCountById = dict.fromkeys(queryItemCountById, 1)
        observedIds = set()
        bagOfWords = self.modeler.itemCountByIdToBagOfWords(
            queryItemCountById, observedIds, self.itemsById,
            query.excludeCategoryIds)
        queryBag = list(bagOfWords)

        # Primary model execute.  Apply to query to generate scored relationship to each "topic"
        weightByTopicId = dict(self.model[queryBag])

        # Start every currently recommendable candidate item off with a zero score
        recommendableItemIds = \
            [   itemId for itemId in self.candidateItemIds
                if self.isItemRecommendable(itemId, queryItemCountById, query, self.categoryIdByItemId)
            ]
        recScoreByItemId = dict.fromkeys(recommendableItemIds, 0.0)

        # Composite scores by accumulating (topic weight * item weight within topic) across topics
        for (topicId, topicWeight) in weightByTopicId.iteritems():
            if not topicWeight > query.minClusterWeight:
                continue  # Ignore topics with tiny contribution
            weightByItemId = self.weightByItemIdByTopicId[topicId]
            for itemId in list(recScoreByItemId):
                # Items absent from the topic's weights contribute nothing
                recScoreByItemId[itemId] += topicWeight * weightByItemId.get(
                    itemId, 0.0)

        # Build result dictionaries, to sort by score
        recommendedData = list()
        for (itemId, totalItemWeight) in recScoreByItemId.iteritems():
            tfidf = 0.0
            if itemId in self.docCountByWordId:
                itemDocCount = self.docCountByWordId[itemId]
                if itemDocCount > 0.0:
                    # Scale TF*IDF score based on baseline document counts to prioritize disproportionately common items
                    tfidf = totalItemWeight * self.docCountByWordId[
                        None] / itemDocCount
            itemModel = {
                # Several aliases for the raw composite weight
                "totalItemWeight": totalItemWeight,
                "tf": totalItemWeight,
                "PPV": totalItemWeight,
                "P(item|query)": totalItemWeight,
                "P(B|A)": totalItemWeight,
                # Several aliases for the document count scaled weight
                "tfidf": tfidf,
                "lift": tfidf,
                "interest": tfidf,
                "P(item|query)/P(item)": tfidf,
                "P(B|A)/P(B)": tfidf,
                "clinical_item_id": itemId,
                # Duplicate for each item, but persist here to enable retrieve by caller
                "weightByTopicId": weightByTopicId,
                "numSelectedTopics": len(weightByTopicId),
            }
            itemModel["score"] = itemModel[query.sortField]
            recommendedData.append(itemModel)
        recommendedData.sort(RowItemFieldComparator("score"), reverse=True)
        return recommendedData

    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <queryStr> [<outputFile>]\n"+\
                    "   <queryStr> Query string to specify what recommendation items to retrieve.\n"+\
                    "       Refer to RecommenderQuery or HTML example code for elaboration of options\n"+\
                    "       Expect formatting like a URL query string: queryItemIds=1,2&resultCount=10&sortField=conditionalFreq&filterField0=baselineFreq<0.01...\n"+\
                    "       The sortField and filterFields will be used to determine what numerical / score columns to dislpay\n"+\
                    "   <outputFile>    Tab-delimited table of recommender results..\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)

        (options, args) = parser.parse_args(argv[1:])
        """
Пример #5
0
    def setUp(self):
        """Prepare state for test cases"""
        DBTestCase.setUp(self)

        log.info("Populate the database with test data")
        from stride.clinical_item.ClinicalItemDataLoader import ClinicalItemDataLoader
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        self.clinicalItemCategoryIdStrList = list()
        headers = ["clinical_item_category_id", "source_table"]
        dataModels = \
            [
                RowItemModel( [-1, "Labs"], headers ),
                RowItemModel( [-2, "Imaging"], headers ),
                RowItemModel( [-3, "Meds"], headers ),
                RowItemModel( [-4, "Nursing"], headers ),
                RowItemModel( [-5, "Problems"], headers ),
                RowItemModel( [-6, "Lab Results"], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item_category",
                                              dataModel)
            self.clinicalItemCategoryIdStrList.append(str(dataItemId))

        headers = [
            "clinical_item_id", "clinical_item_category_id", "name",
            "analysis_status"
        ]
        dataModels = \
            [
                RowItemModel( [1, -1, "CBC",1], headers ),
                RowItemModel( [2, -1, "BMP",0], headers ), # Clear analysis status, so this will be ignored unless changed
                RowItemModel( [3, -1, "Hepatic Panel",1], headers ),
                RowItemModel( [4, -1, "Cardiac Enzymes",1], headers ),
                RowItemModel( [5, -2, "CXR",1], headers ),
                RowItemModel( [6, -2, "RUQ Ultrasound",1], headers ),
                RowItemModel( [7, -2, "CT Abdomen/Pelvis",1], headers ),
                RowItemModel( [8, -2, "CT PE Thorax",1], headers ),
                RowItemModel( [9, -3, "Acetaminophen",1], headers ),
                RowItemModel( [10, -3, "Carvedilol",1], headers ),
                RowItemModel( [11, -3, "Enoxaparin",1], headers ),
                RowItemModel( [12, -3, "Warfarin",1], headers ),
                RowItemModel( [13, -3, "Ceftriaxone",1], headers ),
                RowItemModel( [14, -4, "Foley Catheter",1], headers ),
                RowItemModel( [15, -4, "Strict I&O",1], headers ),
                RowItemModel( [16, -4, "Fall Precautions",1], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item", dataModel)

        # Input file contents in Bag-of-Words formats
        # Specifically avoid the use of items 6 or 7 in the training data
        self.inputBOWFileStr = \
"""[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5]]
[[3,4],[4,4],[9,3],[10,2],[12,6],[13,3],[15,5],[16,8]]
[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5],[9,1],[10,2],[11,1],[12,4],[13,10],[14,1],[15,3],[16,5]]
[[1,4],[2,9],[9,1],[10,2],[11,7],[12,4],[13,2],[16,6]]
[[4,3],[5,31],[8,5],[12,6],[13,8],[16,5]]
"""
        self.instance = TopicModel()
Пример #6
0
class TestTopicModel(DBTestCase):
    def setUp(self):
        """Prepare state for test cases"""
        DBTestCase.setUp(self)

        log.info("Populate the database with test data")
        from stride.clinical_item.ClinicalItemDataLoader import ClinicalItemDataLoader
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        self.clinicalItemCategoryIdStrList = list()
        headers = ["clinical_item_category_id", "source_table"]
        dataModels = \
            [
                RowItemModel( [-1, "Labs"], headers ),
                RowItemModel( [-2, "Imaging"], headers ),
                RowItemModel( [-3, "Meds"], headers ),
                RowItemModel( [-4, "Nursing"], headers ),
                RowItemModel( [-5, "Problems"], headers ),
                RowItemModel( [-6, "Lab Results"], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item_category",
                                              dataModel)
            self.clinicalItemCategoryIdStrList.append(str(dataItemId))

        headers = [
            "clinical_item_id", "clinical_item_category_id", "name",
            "analysis_status"
        ]
        dataModels = \
            [
                RowItemModel( [1, -1, "CBC",1], headers ),
                RowItemModel( [2, -1, "BMP",0], headers ), # Clear analysis status, so this will be ignored unless changed
                RowItemModel( [3, -1, "Hepatic Panel",1], headers ),
                RowItemModel( [4, -1, "Cardiac Enzymes",1], headers ),
                RowItemModel( [5, -2, "CXR",1], headers ),
                RowItemModel( [6, -2, "RUQ Ultrasound",1], headers ),
                RowItemModel( [7, -2, "CT Abdomen/Pelvis",1], headers ),
                RowItemModel( [8, -2, "CT PE Thorax",1], headers ),
                RowItemModel( [9, -3, "Acetaminophen",1], headers ),
                RowItemModel( [10, -3, "Carvedilol",1], headers ),
                RowItemModel( [11, -3, "Enoxaparin",1], headers ),
                RowItemModel( [12, -3, "Warfarin",1], headers ),
                RowItemModel( [13, -3, "Ceftriaxone",1], headers ),
                RowItemModel( [14, -4, "Foley Catheter",1], headers ),
                RowItemModel( [15, -4, "Strict I&O",1], headers ),
                RowItemModel( [16, -4, "Fall Precautions",1], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item", dataModel)

        # Input file contents in Bag-of-Words formats
        # Specifically avoid the use of items 6 or 7 in the training data
        self.inputBOWFileStr = \
"""[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5]]
[[3,4],[4,4],[9,3],[10,2],[12,6],[13,3],[15,5],[16,8]]
[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5],[9,1],[10,2],[11,1],[12,4],[13,10],[14,1],[15,3],[16,5]]
[[1,4],[2,9],[9,1],[10,2],[11,7],[12,4],[13,2],[16,6]]
[[4,3],[5,31],[8,5],[12,6],[13,8],[16,5]]
"""
        self.instance = TopicModel()
        # Instance to test on

    def tearDown(self):
        """Delete the mock database records, and remove any test-prefixed files from the current directory"""
        log.info("Purge test records from the database")
        DBUtil.execute(
            "delete from clinical_item where clinical_item_category_id < 0")
        categoryIdListStr = ",".join(self.clinicalItemCategoryIdStrList)
        DBUtil.execute(
            "delete from clinical_item_category where clinical_item_category_id in (%s)"
            % categoryIdListStr)

        # Files produced by either the plain or the "HDP" prefixed test runs
        testFilePrefixes = (TEST_FILE_PREFIX, "HDP" + TEST_FILE_PREFIX)
        for filename in os.listdir("."):
            if filename.startswith(testFilePrefixes):
                os.remove(filename)

        DBTestCase.tearDown(self)

    def test_topicModel(self):
        """Run the modeling analysis against the mock test data and verify expected stats afterwards.

        Executed twice: first with 3 topics, then again with a topic count of 0
        for the HDP non-parametric model (written under an "HDP" prefixed base filename).
        """
        expectedDocCountByWordId = \
                {1:3, 2:3, 3:3, 4:4, 5:3, None:5, 9:3, 10:3, 11:2, 12:4, 13:4, 14:1, 15:2, 16:4, 8:3}
        runOptions = \
            [
                (3, TEST_FILE_PREFIX),
                (0, "HDP" + TEST_FILE_PREFIX),  # Do again but with HDP non-parametric model
            ]

        for (numTopics, filePrefix) in runOptions:
            subargv = [
                "TopicModel", "-n",
                str(numTopics), "-i",
                str(ITEMS_PER_TOPIC), "-", filePrefix
            ]

            # Feed the mock input through a substituted stdin, but always put the real one back
            #   so the replacement does not leak into other tests
            originalStdin = sys.stdin
            sys.stdin = StringIO(self.inputBOWFileStr)
            try:
                self.instance.main(subargv)
            finally:
                sys.stdin = originalStdin

            model = self.instance.loadModel(filePrefix)
            # Close the result file once verified, since tearDown removes these files afterwards
            with open(self.instance.topTopicFilename(filePrefix)) as topTopicFile:
                # Hand over a fresh copy each run, as the original did with separate literals
                self.assertExpectedTopItems(dict(expectedDocCountByWordId),
                                            model, topTopicFile)

    def assertExpectedTopItems(self, expectedDocCountByWordId, model,
                               topTopicFile):
        """Verify the top topic file agrees with the model's topic parameters and the expected document counts.

        With randomized optimization algorithm, cannot depend on stable test results with each run.
        Instead make sure results are internally consistent, and that raw count data is consistent.

        expectedDocCountByWordId -- Expected document count by item ID, with the overall count under the None key
        model -- Model whose topics are enumerated through self.instance.enumerateTopics
        topTopicFile -- Open tab-delimited file with topic_id, item_id, score and tfidf columns
        """
        # Values from model topic parameters
        expectedScoreByItemIdByTopicId = dict()
        for (topicId, topicItems) in self.instance.enumerateTopics(
                model, ITEMS_PER_TOPIC):
            scoreByItemId = expectedScoreByItemIdByTopicId[topicId] = dict()
            for (itemId, score) in topicItems:
                scoreByItemId[itemId] = score
        # Add expected word document counts under the "None" topic
        expectedScoreByItemIdByTopicId[None] = expectedDocCountByWordId

        # Verify Top Topic Files match
        fileScoreByItemIdByTopicId = dict()
        itemsChecked = 0
        for topicItem in TabDictReader(topTopicFile):
            # Null markers in the file translate to None IDs
            topicIdStr = topicItem["topic_id"]
            topicId = int(topicIdStr) if topicIdStr != NULL_STRING else None
            itemIdStr = topicItem["item_id"]
            itemId = int(itemIdStr) if itemIdStr != NULL_STRING else None
            score = float(topicItem["score"])
            tfidf = float(topicItem["tfidf"])

            # Recalculate the TF*IDF that should accompany the score, given the expected document counts
            expectedTFIDF = 0.0
            if itemId in expectedDocCountByWordId:
                itemDocCount = expectedDocCountByWordId[itemId]
                if itemDocCount > 0:
                    expectedTFIDF = score * expectedDocCountByWordId[
                        None] / itemDocCount
            self.assertAlmostEqual(expectedTFIDF, tfidf, places=5)

            fileScoreByItemIdByTopicId.setdefault(topicId,
                                                  dict())[itemId] = score
            itemsChecked += 1
        # Make sure an actual test happened
        self.assertTrue(itemsChecked > 0)

        # Scores recorded in the file should match those enumerated from the model (or the expected counts)
        for (topicId, fileScoreByItemId) in fileScoreByItemIdByTopicId.items():
            expectedScoreByItemId = expectedScoreByItemIdByTopicId[topicId]
            self.assertAlmostEqualsDict(fileScoreByItemId,
                                        expectedScoreByItemId,
                                        places=5)