Примеры использования TopicModel на Python

Язык программирования: Python

Пространство имен/Пакет: medinfo.cpoe.TopicModel

Класс/Тип: TopicModel

Примеров на hotexamples.com: 6

Python TopicModel — найдено 6 примеров. Это лучшие примеры кода на Python для medinfo.cpoe.TopicModel.TopicModel, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

TopicModel(4)

generateWeightByItemIdByTopicId(2)

loadModelAndDocCounts(2)

main(2)

enumerateTopics(1)

itemCountByIdToBagOfWords(1)

loadModel(1)

topTopicFilename(1)

Пример #1

Показать файл

Файл: step20.buildTopicModel.py Проект: sxu11/CDSS_UMich

def main_buildTopicModel(argv):
    """Run TopicModel.main once for every entry in numTopicsOptions.

    Each run receives a synthesized command line: the topic count, the
    bag-of-words input file (SOURCE_DATA_DIR + INPUT_FILENAME) and an output
    model path under MODEL_DIR named after the input file and topic count.

    argv is accepted but not read by this function.
    Returns the ``model`` attribute of the shared TopicModel instance once
    all runs have finished.
    """
    bowInputFilename = SOURCE_DATA_DIR + INPUT_FILENAME

    mod = TopicModel()
    for numTopics in numTopicsOptions:
        # Output path: <MODEL_DIR>/topicModel.<input basename>.<N>Topic.model
        modelFilename = (MODEL_DIR + "/topicModel."
                         + os.path.basename(bowInputFilename)
                         + ".%dTopic.model" % (numTopics))
        mod.main(["TopicModel", "-n", str(numTopics),
                  bowInputFilename, modelFilename])
    return mod.model

Пример #2

Показать файл

    def __init__(self, model, docCountByWordId=None):
        """Initialize module with prior generated model and word document counts from TopicModel module.

        model -- Either an already loaded model object (when docCountByWordId
            is supplied as well), or a base filename from which both the model
            and the word document counts are loaded.
        docCountByWordId -- Optional word document counts matching the model.
            Only None means "not supplied"; an explicitly passed empty mapping
            is stored as given rather than triggering a load from file.
        """
        BaseItemRecommender.__init__(self)
        # Utility instance to run off of
        self.modeler = TopicModel()

        # Compare against None explicitly: a truthiness test would treat an
        # empty (but deliberately provided) mapping as missing and then try to
        # use the model object as a filename.
        if docCountByWordId is not None:  # Specified both options
            self.model = model
            self.docCountByWordId = docCountByWordId
        else:  # If only the first one specified, interpret it as a base filename to load the objects from
            filename = model
            (self.model, self.docCountByWordId
             ) = self.modeler.loadModelAndDocCounts(filename)

        # Cached lookup data.  Don't repeat work for serial queries
        self.itemsById = None
        self.categoryIdByItemId = None
        self.candidateItemIds = None
        self.weightByItemIdByTopicId = None

Пример #3

Показать файл

def main_quickTest(argv):
    """Time how long it takes to load a saved topic model and to walk its item weights.

    argv[1] is the model filename passed to loadModelAndDocCounts.  Elapsed
    times for the load and for generateWeightByItemIdByTopicId are reported
    through log.info.

    NOTE(review): relies on xrange / iteritems, so this is Python 2 code.
    """
    modelFilename = argv[1]
    modeler = TopicModel()

    # Time the load of the model and its word document counts
    timer = time.time()
    (model, docCountByWordId) = modeler.loadModelAndDocCounts(modelFilename)
    timer = time.time() - timer
    log.info("%.2f seconds to load", timer)

    # Time the weight generation.
    # NOTE(review): 100 is presumably the number of items kept per topic - confirm against TopicModel
    timer = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(
        model, 100)
    timer = time.time() - timer
    log.info("%.2f seconds to generate weights", timer)

    # Walk every (topic, item) weight three times; the values themselves are
    # not used, each entry only ticks the ProgressDots tracker
    for i in xrange(3):
        prog = ProgressDots()
        for (topicId, weightByItemId) in weightByItemIdByTopicId.iteritems():
            for (itemId, itemWeight) in weightByItemId.iteritems():
                prog.update()
        prog.printStatus()
    """

Пример #4

Показать файл

class TopicModelRecommender(BaseItemRecommender):
    """Implementation class for item (e.g., order) recommendation based on topic models 
    (LDA Latent Dirichlet Allocation or HDP Hierarchical Dirichlet Process).
    """
    def __init__(self, model, docCountByWordId=None):
        """Initialize module with prior generated model and word document counts from TopicModel module.

        model -- Either an already loaded model object (when docCountByWordId
            is supplied as well), or a base filename from which both the model
            and the word document counts are loaded.
        docCountByWordId -- Optional word document counts matching the model.
            Only None means "not supplied"; an explicitly passed empty mapping
            is stored as given rather than triggering a load from file.
        """
        BaseItemRecommender.__init__(self)
        # Utility instance to run off of
        self.modeler = TopicModel()

        # Compare against None explicitly: a truthiness test would treat an
        # empty (but deliberately provided) mapping as missing and then try to
        # use the model object as a filename.
        if docCountByWordId is not None:  # Specified both options
            self.model = model
            self.docCountByWordId = docCountByWordId
        else:  # If only the first one specified, interpret it as a base filename to load the objects from
            filename = model
            (self.model, self.docCountByWordId
             ) = self.modeler.loadModelAndDocCounts(filename)

        # Cached lookup data.  Don't repeat work for serial queries
        self.itemsById = None
        self.categoryIdByItemId = None
        self.candidateItemIds = None
        self.weightByItemIdByTopicId = None

    def initItemLookups(self, query):
        """Fill the cached item lookups used while scoring queries.

        Loads the "clinical_item" table keyed by item id, derives the
        item id -> category id mapping from it, and collects as candidates
        every item id present in the word document counts that
        isItemRecommendable accepts for the given query (evaluated against an
        empty set of query items).
        """
        self.itemsById = DBUtil.loadTableAsDict("clinical_item")
        self.categoryIdByItemId = dict(
            (itemId, item["clinical_item_category_id"])
            for (itemId, item) in self.itemsById.iteritems())

        noQueryItems = set()
        self.candidateItemIds = set()
        for itemId in self.docCountByWordId.keys():
            if not self.isItemRecommendable(itemId, noQueryItems, query,
                                            self.categoryIdByItemId):
                continue
            self.candidateItemIds.add(itemId)

    def __call__(self, query):
        """Score recommendable items for the query through the topic model.

        The query items are converted to a bag of words and applied to the
        model to obtain a weight per topic.  Every recommendable candidate
        item then accumulates (topic weight * item weight within topic) over
        the topics whose weight exceeds query.minClusterWeight.

        Returns a list of dictionaries, one per scored item, carrying the raw
        score and its document-count scaled variant under several alias keys,
        plus a "score" key copied from the field named by query.sortField.
        The list is sorted on "score" in descending order.
        """
        # Lazy one-time setup so that serial queries reuse the same lookups
        if self.itemsById is None:
            self.initItemLookups(query)
        if self.weightByItemIdByTopicId is None:
            self.weightByItemIdByTopicId = self.modeler.generateWeightByItemIdByTopicId(
                self.model, query.itemsPerCluster)

        # Normalize the query items to {itemId: count}.
        # Anything that is not already a dictionary (e.g., list or set) counts each item once.
        queryItemCountById = query.queryItemIds
        if not isinstance(queryItemCountById, dict):
            queryItemCountById = dict.fromkeys(queryItemCountById, 1)

        observedIds = set()
        bagOfWords = self.modeler.itemCountByIdToBagOfWords(
            queryItemCountById, observedIds, self.itemsById,
            query.excludeCategoryIds)
        queryBag = list(bagOfWords)

        # Primary model execution: scored relationship of the query to each "topic"
        weightByTopicId = dict(
            (topicId, topicWeight)
            for (topicId, topicWeight) in self.model[queryBag])

        # Start every item that is recommendable for this query at a zero score
        recScoreByItemId = dict(
            (itemId, 0.0) for itemId in self.candidateItemIds
            if self.isItemRecommendable(itemId, queryItemCountById, query,
                                        self.categoryIdByItemId))

        # Accumulate the weighted contribution of each sufficiently relevant topic
        for topicId, topicWeight in weightByTopicId.iteritems():
            if not (topicWeight > query.minClusterWeight):
                continue  # Ignore topics with tiny contribution
            weightByItemId = self.weightByItemIdByTopicId[topicId]
            for itemId in recScoreByItemId:
                itemWeight = weightByItemId[itemId] if itemId in weightByItemId else 0.0
                recScoreByItemId[itemId] += topicWeight * itemWeight

        # One result dictionary per item, later sorted by score
        docCounts = self.docCountByWordId
        numSelectedTopics = len(weightByTopicId)
        rawScoreFields = ("totalItemWeight", "tf", "PPV", "P(item|query)", "P(B|A)")
        scaledScoreFields = ("tfidf", "lift", "interest", "P(item|query)/P(item)", "P(B|A)/P(B)")
        recommendedData = list()
        for itemId, totalItemWeight in recScoreByItemId.iteritems():
            if itemId in docCounts and docCounts[itemId] > 0.0:
                # Scale by baseline document counts (total under the None key)
                # relative to the count for this item
                tfidf = totalItemWeight * docCounts[None] / docCounts[itemId]
            else:
                tfidf = 0.0

            itemModel = dict()
            for field in rawScoreFields:
                itemModel[field] = totalItemWeight
            for field in scaledScoreFields:
                itemModel[field] = tfidf
            itemModel["clinical_item_id"] = itemId
            # Duplicated for each item, but persisted here to enable retrieval by the caller
            itemModel["weightByTopicId"] = weightByTopicId
            itemModel["numSelectedTopics"] = numSelectedTopics
            itemModel["score"] = itemModel[query.sortField]
            recommendedData.append(itemModel)

        recommendedData.sort(RowItemFieldComparator("score"), reverse=True)
        return recommendedData

    def main(self, argv):
        """Main method, callable from command line.

        argv -- Full argument list including the program name in argv[0];
            only argv[1:] is handed to the option parser.

        NOTE(review): only the usage text and the option parsing are visible
        in this excerpt; what is done with the parsed arguments is not shown.
        """
        # Usage text shown by OptionParser; %prog is expanded by optparse to the program name
        usageStr =  "usage: %prog [options] <queryStr> [<outputFile>]\n"+\
                    "   <queryStr> Query string to specify what recommendation items to retrieve.\n"+\
                    "       Refer to RecommenderQuery or HTML example code for elaboration of options\n"+\
                    "       Expect formatting like a URL query string: queryItemIds=1,2&resultCount=10&sortField=conditionalFreq&filterField0=baselineFreq<0.01...\n"+\
                    "       The sortField and filterFields will be used to determine what numerical / score columns to dislpay\n"+\
                    "   <outputFile>    Tab-delimited table of recommender results..\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)

        # Skip argv[0] (program name) when parsing
        (options, args) = parser.parse_args(argv[1:])
        """

Пример #5

Показать файл

    def setUp(self):
        """Prepare state for test cases"""
        DBTestCase.setUp(self)

        log.info("Populate the database with test data")
        from stride.clinical_item.ClinicalItemDataLoader import ClinicalItemDataLoader
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        self.clinicalItemCategoryIdStrList = list()
        headers = ["clinical_item_category_id", "source_table"]
        dataModels = \
            [
                RowItemModel( [-1, "Labs"], headers ),
                RowItemModel( [-2, "Imaging"], headers ),
                RowItemModel( [-3, "Meds"], headers ),
                RowItemModel( [-4, "Nursing"], headers ),
                RowItemModel( [-5, "Problems"], headers ),
                RowItemModel( [-6, "Lab Results"], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item_category",
                                              dataModel)
            self.clinicalItemCategoryIdStrList.append(str(dataItemId))

        headers = [
            "clinical_item_id", "clinical_item_category_id", "name",
            "analysis_status"
        ]
        dataModels = \
            [
                RowItemModel( [1, -1, "CBC",1], headers ),
                RowItemModel( [2, -1, "BMP",0], headers ), # Clear analysis status, so this will be ignored unless changed
                RowItemModel( [3, -1, "Hepatic Panel",1], headers ),
                RowItemModel( [4, -1, "Cardiac Enzymes",1], headers ),
                RowItemModel( [5, -2, "CXR",1], headers ),
                RowItemModel( [6, -2, "RUQ Ultrasound",1], headers ),
                RowItemModel( [7, -2, "CT Abdomen/Pelvis",1], headers ),
                RowItemModel( [8, -2, "CT PE Thorax",1], headers ),
                RowItemModel( [9, -3, "Acetaminophen",1], headers ),
                RowItemModel( [10, -3, "Carvedilol",1], headers ),
                RowItemModel( [11, -3, "Enoxaparin",1], headers ),
                RowItemModel( [12, -3, "Warfarin",1], headers ),
                RowItemModel( [13, -3, "Ceftriaxone",1], headers ),
                RowItemModel( [14, -4, "Foley Catheter",1], headers ),
                RowItemModel( [15, -4, "Strict I&O",1], headers ),
                RowItemModel( [16, -4, "Fall Precautions",1], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item", dataModel)

        # Input file contents in Bag-of-Words formats
        # Specifically avoid the use of items 6 or 7 in the training data
        self.inputBOWFileStr = \
"""[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5]]
[[3,4],[4,4],[9,3],[10,2],[12,6],[13,3],[15,5],[16,8]]
[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5],[9,1],[10,2],[11,1],[12,4],[13,10],[14,1],[15,3],[16,5]]
[[1,4],[2,9],[9,1],[10,2],[11,7],[12,4],[13,2],[16,6]]
[[4,3],[5,31],[8,5],[12,6],[13,8],[16,5]]
"""
        self.instance = TopicModel()

Пример #6

Показать файл

class TestTopicModel(DBTestCase):
    def setUp(self):
        """Prepare state for test cases"""
        DBTestCase.setUp(self)

        log.info("Populate the database with test data")
        from stride.clinical_item.ClinicalItemDataLoader import ClinicalItemDataLoader
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        self.clinicalItemCategoryIdStrList = list()
        headers = ["clinical_item_category_id", "source_table"]
        dataModels = \
            [
                RowItemModel( [-1, "Labs"], headers ),
                RowItemModel( [-2, "Imaging"], headers ),
                RowItemModel( [-3, "Meds"], headers ),
                RowItemModel( [-4, "Nursing"], headers ),
                RowItemModel( [-5, "Problems"], headers ),
                RowItemModel( [-6, "Lab Results"], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item_category",
                                              dataModel)
            self.clinicalItemCategoryIdStrList.append(str(dataItemId))

        headers = [
            "clinical_item_id", "clinical_item_category_id", "name",
            "analysis_status"
        ]
        dataModels = \
            [
                RowItemModel( [1, -1, "CBC",1], headers ),
                RowItemModel( [2, -1, "BMP",0], headers ), # Clear analysis status, so this will be ignored unless changed
                RowItemModel( [3, -1, "Hepatic Panel",1], headers ),
                RowItemModel( [4, -1, "Cardiac Enzymes",1], headers ),
                RowItemModel( [5, -2, "CXR",1], headers ),
                RowItemModel( [6, -2, "RUQ Ultrasound",1], headers ),
                RowItemModel( [7, -2, "CT Abdomen/Pelvis",1], headers ),
                RowItemModel( [8, -2, "CT PE Thorax",1], headers ),
                RowItemModel( [9, -3, "Acetaminophen",1], headers ),
                RowItemModel( [10, -3, "Carvedilol",1], headers ),
                RowItemModel( [11, -3, "Enoxaparin",1], headers ),
                RowItemModel( [12, -3, "Warfarin",1], headers ),
                RowItemModel( [13, -3, "Ceftriaxone",1], headers ),
                RowItemModel( [14, -4, "Foley Catheter",1], headers ),
                RowItemModel( [15, -4, "Strict I&O",1], headers ),
                RowItemModel( [16, -4, "Fall Precautions",1], headers ),
            ]
        for dataModel in dataModels:
            (dataItemId,
             isNew) = DBUtil.findOrInsertItem("clinical_item", dataModel)

        # Input file contents in Bag-of-Words formats
        # Specifically avoid the use of items 6 or 7 in the training data
        self.inputBOWFileStr = \
"""[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5]]
[[3,4],[4,4],[9,3],[10,2],[12,6],[13,3],[15,5],[16,8]]
[[1,1],[2,2],[3,1],[4,4],[5,10],[8,5],[9,1],[10,2],[11,1],[12,4],[13,10],[14,1],[15,3],[16,5]]
[[1,4],[2,9],[9,1],[10,2],[11,7],[12,4],[13,2],[16,6]]
[[4,3],[5,31],[8,5],[12,6],[13,8],[16,5]]
"""
        self.instance = TopicModel()
        # Instance to test on

    def tearDown(self):
        """Remove the database records and output files left behind by setUp and the tests"""
        log.info("Purge test records from the database")
        DBUtil.execute(
            "delete from clinical_item where clinical_item_category_id < 0")
        categoryIdCsv = ",".join(self.clinicalItemCategoryIdStrList)
        DBUtil.execute(
            "delete from clinical_item_category where clinical_item_category_id in (%s)"
            % categoryIdCsv)

        # Delete any file in the working directory that carries one of the test prefixes
        testFilePrefixes = (TEST_FILE_PREFIX, "HDP" + TEST_FILE_PREFIX)
        for filename in os.listdir("."):
            if filename.startswith(testFilePrefixes):
                os.remove(filename)

        DBTestCase.tearDown(self)

    def test_topicModel(self):
        """Run the modeling analysis against the mock test data and verify expected stats afterwards.

        The analysis is run twice through TopicModel.main, feeding the
        bag-of-words documents on stdin: once with 3 topics and once with a
        topic count of 0 for the HDP non-parametric model.  After each run the
        saved model and its top-topic file are checked by
        assertExpectedTopItems.
        """
        # Number of training documents containing each item; the None key holds
        # the total document count
        expectedDocCountByWordId = \
                {1:3, 2:3, 3:3, 4:4, 5:3, None:5, 9:3, 10:3, 11:2, 12:4, 13:4, 14:1, 15:2, 16:4, 8:3}

        # (number of topics, output file prefix) for each modeling run
        runConfigs = [
            (3, TEST_FILE_PREFIX),
            (0, "HDP" + TEST_FILE_PREFIX),  # HDP non-parametric model
        ]

        # The input is fed through stdin; restore the real stream afterwards so
        # the substitution cannot leak into other tests, even on failure
        originalStdin = sys.stdin
        try:
            for (numTopics, filePrefix) in runConfigs:
                sys.stdin = StringIO(self.inputBOWFileStr)
                subargv = [
                    "TopicModel", "-n",
                    str(numTopics), "-i",
                    str(ITEMS_PER_TOPIC), "-", filePrefix
                ]
                self.instance.main(subargv)

                model = self.instance.loadModel(filePrefix)
                # Close the top-topic file when done so no handle is left open
                # on a file that tearDown is about to delete
                with open(self.instance.topTopicFilename(filePrefix)) as topTopicFile:
                    # Hand each run its own copy of the expected counts
                    self.assertExpectedTopItems(dict(expectedDocCountByWordId),
                                                model, topTopicFile)
        finally:
            sys.stdin = originalStdin

    def assertExpectedTopItems(self, expectedDocCountByWordId, model,
                               topTopicFile):
        """Check that the top-topic file agrees with the model and the expected document counts.

        The optimization algorithm is randomized, so exact scores are not
        stable between runs.  Instead verify internal consistency: every row
        of the file must carry a tfidf equal to its score scaled by the
        expected document counts, and the scores per topic must match those
        enumerated from the model (with the expected document counts standing
        in for the None topic).
        """
        # Scores straight from the model topic parameters
        scoreByItemIdByTopicId = dict()
        for (topicId, topicItems) in self.instance.enumerateTopics(
                model, ITEMS_PER_TOPIC):
            scoreByItemIdByTopicId[topicId] = dict(
                (itemId, score) for (itemId, score) in topicItems)
        # Expected word document counts go under the "None" topic
        scoreByItemIdByTopicId[None] = expectedDocCountByWordId

        def parseOptionalInt(text):
            # NULL_STRING in the file stands for a missing (None) id
            return int(text) if text != NULL_STRING else None

        # Walk the top-topic file, verifying each row and collecting its scores
        topScoreByItemIdByTopicId = dict()
        itemsChecked = 0
        for topicItem in TabDictReader(topTopicFile):
            topicId = parseOptionalInt(topicItem["topic_id"])
            itemId = parseOptionalInt(topicItem["item_id"])
            score = float(topicItem["score"])
            tfidf = float(topicItem["tfidf"])

            if itemId in expectedDocCountByWordId and expectedDocCountByWordId[
                    itemId] > 0:
                expectedTFIDF = score * expectedDocCountByWordId[
                    None] / expectedDocCountByWordId[itemId]
            else:
                expectedTFIDF = 0.0
            self.assertAlmostEqual(expectedTFIDF, tfidf, places=5)

            topScoreByItemIdByTopicId.setdefault(topicId, dict())[itemId] = score
            itemsChecked += 1
        # Make sure an actual test happened
        self.assertTrue(itemsChecked > 0)

        # Scores read from the file must match the model-derived ones
        for topicId, topScoreByItemId in topScoreByItemIdByTopicId.items():
            self.assertAlmostEqualsDict(topScoreByItemId,
                                        scoreByItemIdByTopicId[topicId],
                                        places=5)