예제 #1
0
def main_quickTest(argv):
    """Ad-hoc benchmark harness for TopicModel.

    Loads a previously generated model (argv[1] is the base filename),
    times the load and the per-topic weight generation, then repeatedly
    walks the full topic/item weight structure to gauge iteration cost.
    """
    modeler = TopicModel()

    startTime = time.time()
    model, docCountByWordId = modeler.loadModelAndDocCounts(argv[1])
    elapsed = time.time() - startTime
    log.info("%.2f seconds to load", elapsed)

    startTime = time.time()
    weightByItemIdByTopicId = modeler.generateWeightByItemIdByTopicId(
        model, 100)
    elapsed = time.time() - startTime
    log.info("%.2f seconds to generate weights", elapsed)

    # Three traversal trials; only the progress counter consumes the items
    for trial in range(3):
        prog = ProgressDots()
        for weightByItemId in weightByItemIdByTopicId.values():
            for itemId in weightByItemId:
                prog.update()
        prog.printStatus()
    """
예제 #2
0
class TopicModelRecommender(BaseItemRecommender):
    """Implementation class for item (e.g., order) recommendation based on topic models 
    (LDA Latent Dirichlet Allocation or HDP Hierarchical Dirichlet Process).
    """
    def __init__(self, model, docCountByWordId=None):
        """Initialize module with prior generated model and word document counts from TopicModel module.

        model: Either a pre-built topic model object (when docCountByWordId
            is also supplied) or a base filename from which both the model
            and the document counts will be loaded.
        docCountByWordId: Dictionary of document counts keyed by word/item
            ID, or None to load it from file along with the model.
        """
        BaseItemRecommender.__init__(self)
        self.modeler = TopicModel()
        # Utility instance to run off of

        # Explicit None check: a provided-but-empty count dict must not be
        # mistaken for "load from file" (plain truthiness would misroute it
        # and treat the model object as a filename).
        if docCountByWordId is not None:  # Specified both options
            self.model = model
            self.docCountByWordId = docCountByWordId
        else:  # If only the first one specified, interpret it as a base filename to load the objects from
            filename = model
            (self.model, self.docCountByWordId
             ) = self.modeler.loadModelAndDocCounts(filename)

        # Cached lookup data.  Don't repeat work for serial queries
        self.itemsById = None
        self.categoryIdByItemId = None
        self.candidateItemIds = None
        self.weightByItemIdByTopicId = None

    def initItemLookups(self, query):
        """Populate cached clinical item lookups and the recommendable candidate set.

        Loads the clinical_item table once, derives the item->category map,
        then records every counted item that passes the recommendability
        filter (with an empty query item set) as a candidate.
        """
        self.itemsById = DBUtil.loadTableAsDict("clinical_item")
        self.categoryIdByItemId = dict(
            (itemId, item["clinical_item_category_id"])
            for itemId, item in self.itemsById.items())
        noQueryItems = set()  # No query context at this stage
        self.candidateItemIds = set(
            itemId
            for itemId in self.docCountByWordId.keys()
            if self.isItemRecommendable(itemId, noQueryItems, query,
                                        self.categoryIdByItemId))

    def __call__(self, query):
        """Generate ranked item recommendations for the given query.

        Maps the query items into the topic model's bag-of-words space,
        scores each topic's relevance to the query, then scores each
        candidate item as the topic-weight-weighted average of its
        per-topic item weights.  Returns a list of item-model dicts
        sorted descending by the field named in query.sortField
        (aliased under the "score" key).
        """
        # Given query items, use model to find related topics with relationship scores

        # Load item category lookup information (cached for serial queries)
        if self.itemsById is None:
            self.initItemLookups(query)

        # Load model weight parameters once to save time on serial queries
        if self.weightByItemIdByTopicId is None:
            self.weightByItemIdByTopicId = self.modeler.generateWeightByItemIdByTopicId(
                self.model, query.itemsPerCluster)

        # Adapt query into bag-of-words format
        queryItemCountById = query.queryItemIds
        if not isinstance(
                queryItemCountById, dict
        ):  # Not a dictionary, probably a one dimensional list/set, then just add counts of 1
            itemIds = queryItemCountById
            queryItemCountById = dict()
            for itemId in itemIds:
                queryItemCountById[itemId] = 1
        observedIds = set()
        queryBag = list(
            self.modeler.itemCountByIdToBagOfWords(queryItemCountById,
                                                   observedIds, self.itemsById,
                                                   query.excludeCategoryIds))

        # Primary model execute.  Apply to query to generate scored relationship to each "topic"
        # (model[queryBag] yields (topicId, topicWeight) pairs)
        topicWeights = self.model[queryBag]
        weightByTopicId = dict()
        for (topicId, topicWeight) in topicWeights:
            weightByTopicId[topicId] = topicWeight

        # Composite scores for (recommendable) items by taking weighted average across the top items for each topic
        # First pass: select recommendable candidates, initialized to zero score
        recScoreByItemId = dict()
        for itemId in self.candidateItemIds:
            if self.isItemRecommendable(itemId, queryItemCountById, query,
                                        self.categoryIdByItemId):
                recScoreByItemId[itemId] = 0.0
        # Second pass: accumulate topicWeight * itemWeight over qualifying topics
        for topicId, topicWeight in weightByTopicId.iteritems():
            if topicWeight > query.minClusterWeight:  # Ignore topics with tiny contribution
                weightByItemId = self.weightByItemIdByTopicId[topicId]
                for itemId in recScoreByItemId.keys():
                    itemWeight = 0.0  # Items absent from this topic's top list contribute nothing
                    if itemId in weightByItemId:
                        itemWeight = weightByItemId[itemId]
                    recScoreByItemId[itemId] += topicWeight * itemWeight

        # Build 2-pls with lists to sort by score
        recommendedData = list()
        for itemId, totalItemWeight in recScoreByItemId.iteritems():
            tfidf = 0.0
            if itemId in self.docCountByWordId and self.docCountByWordId[
                    itemId] > 0.0:
                # NOTE(review): docCountByWordId[None] appears to hold the
                # baseline/total document count -- confirm against TopicModel
                tfidf = totalItemWeight * self.docCountByWordId[
                    None] / self.docCountByWordId[itemId]
                # Scale TF*IDF score based on baseline document counts to prioritize disproportionately common items
            # Same value exposed under several synonymous statistic names so
            # any of them can serve as query.sortField
            itemModel = \
                {   "totalItemWeight": totalItemWeight, "tf": totalItemWeight, "PPV": totalItemWeight, "P(item|query)": totalItemWeight, "P(B|A)": totalItemWeight,
                    "tfidf": tfidf, "lift": tfidf, "interest": tfidf, "P(item|query)/P(item)": tfidf, "P(B|A)/P(B)": tfidf,
                    "clinical_item_id": itemId,
                    "weightByTopicId": weightByTopicId, "numSelectedTopics": len(weightByTopicId),  # Duplicate for each item, but persist here to enable retrieve by caller
                }
            # Alias the caller-selected sort column under a common "score" key
            itemModel["score"] = itemModel[query.sortField]
            recommendedData.append(itemModel)
        # Python 2 comparator-style sort (cmp as positional arg), descending by score
        recommendedData.sort(RowItemFieldComparator("score"), reverse=True)
        return recommendedData

    def main(self, argv):
        """Main method, callable from command line"""
        usageStr =  "usage: %prog [options] <queryStr> [<outputFile>]\n"+\
                    "   <queryStr> Query string to specify what recommendation items to retrieve.\n"+\
                    "       Refer to RecommenderQuery or HTML example code for elaboration of options\n"+\
                    "       Expect formatting like a URL query string: queryItemIds=1,2&resultCount=10&sortField=conditionalFreq&filterField0=baselineFreq<0.01...\n"+\
                    "       The sortField and filterFields will be used to determine what numerical / score columns to dislpay\n"+\
                    "   <outputFile>    Tab-delimited table of recommender results..\n"+\
                    "                       Leave blank or specify \"-\" to send to stdout.\n"
        parser = OptionParser(usage=usageStr)

        (options, args) = parser.parse_args(argv[1:])
        """