예제 #1
0
def showData():
    """Print collection statistics for the whole database.

    Prints, in order:
      * the document count of ``app_table``;
      * for every category name loaded from the JSON file at
        ``const.WANDOUJIA_CATA_JSON_FILE``, the document count of the
        collection named after that (whitespace-stripped) category;
      * the sum, over all categories, of the length of
        ``MongoUtil.distinct_count(<category>, "appid")``;
      * the document count of ``word_table``.

    Returns:
        None. All results are written to stdout.
    """
    print("总app数量:" + str(MongoUtil.count("app_table")))

    # Read the category list inside a context manager so the file handle is
    # closed as soon as parsing finishes instead of leaking until GC.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)

    locationCount = 0
    for cataname in catas:
        # Category names double as collection names; strip stray whitespace
        # before using them for lookups.
        cataname = cataname.strip()
        print(cataname + " 数量:" + str(MongoUtil.count(cataname)))
        locationCount += len(MongoUtil.distinct_count(cataname, "appid"))

    print("获取评论的app数量:" + str(locationCount), end="\n\n")
    print("word数量:" + str(MongoUtil.count("word_table")))
예제 #2
0
def showData(cataname):
    """Print statistics for a single category.

    Prints the document counts of ``app_table`` and ``word_table``, the
    number of ``app_table`` documents whose ``catagory`` field equals
    ``cataname``, the document count of the collection named after the
    stripped category, the length of
    ``MongoUtil.distinct_count(<category>, "appid")`` for that collection,
    and the difference between the category's app count and that length.

    Args:
        cataname: Category name; also used (after stripping whitespace) as
            the name of the per-category collection.

    Returns:
        None. All results are written to stdout.
    """
    print("总app数量:{}".format(MongoUtil.count("app_table")))
    print("word数量:{}".format(MongoUtil.count("word_table")))

    # NOTE(review): this query and the line printed after it use the name
    # exactly as passed in; stripping only happens afterwards. Confirm
    # whether that ordering is intentional.
    category_filter = {"catagory": cataname}
    app_total = MongoUtil.find("app_table", category_filter).count()
    print("{}的 app数量: {}".format(cataname, app_total))

    collection = cataname.strip()
    print("{}的 location 数量:{}".format(collection,
                                       MongoUtil.count(collection)))

    with_comments = len(MongoUtil.distinct_count(collection, "appid"))
    print("已获取评论的 app数量:{}".format(with_comments))
    print("未获取评论的 app数量:{}".format(app_total - with_comments))
예제 #3
0
    def tf_idf(self):
        """Compute and print a TF-IDF score for every word in ``self.worddict``.

        For each ``word -> occurrences`` entry of ``self.worddict``:

        * tf  = occurrences / ``self.wordcount``
        * idf = log(other_apps / (other_apps_containing_word + 1))

        Both "other" counts come from the collection named
        ``self.app["catagory"]`` via ``MongoUtil.distinct_count(..., "appid")``,
        with one subtracted to exclude this app itself.

        Returns:
            dict mapping each word to its tf-idf value, or ``None`` when the
            computation is skipped: ``self.worddict`` is empty/None,
            ``self.wordcount`` is below 100, or the category has no other
            app to compare against.
        """
        if not self.worddict:
            print("请初始化词频统计")
            return None
        if self.wordcount < 100:
            print("该app的评论数量过少,获取关键词将会不准确")
            return None

        category = self.app["catagory"]

        # Total number of documents (distinct apps) in the category,
        # excluding this app itself.
        docu_count = len(
            MongoUtil.distinct_count(category, "appid", value=None)) - 1
        if docu_count <= 0:
            # With no other document the IDF would be log(0), which raises
            # ValueError; bail out the same way the guards above do.
            print("该分类下没有其他app,无法计算tf-idf")
            return None

        tf_idfdict = {}
        for word, occurrences in self.worddict.items():
            # NOTE(review): assumes every counted word already has a row in
            # word_table; a missing row would make ``result`` unusable here.
            result = MongoUtil.find_one("word_table", {"word": word})
            wordid = result["_id"]

            # Number of other apps containing the word (this app excluded).
            # Clamp at zero so the "+ 1" divisor below can never become 0.
            include_count = max(
                len(
                    MongoUtil.distinct_count(category,
                                             "appid",
                                             value={"wordid": wordid})) - 1,
                0)

            wordidf = math.log(docu_count / (include_count + 1))
            wordtf = float(occurrences / self.wordcount)
            tf_idfdict[word] = wordtf * wordidf

        for word, score in tf_idfdict.items():
            print(word + "    出现的次数:" + str(self.worddict[word]) +
                  "     tf-idf计算值:" + str(score))

        return tf_idfdict