def showData():
    """Print database statistics for every category in the category JSON file.

    Output, in order:
      * the document count of ``app_table``;
      * for each category name read from ``const.WANDOUJIA_CATA_JSON_FILE``
        (whitespace-stripped), the document count of the collection with
        that name;
      * the summed number of distinct ``appid`` values over all category
        collections, followed by a blank line;
      * the document count of ``word_table``.

    Returns:
        None. All results are written to stdout.
    """
    print("总app数量:" + str(MongoUtil.count("app_table")))

    # Read the category list inside a context manager so the file handle is
    # closed as soon as parsing finishes instead of being left to the GC.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)

    # Running total of distinct appids seen across all category collections.
    locationCount = 0
    for cataname in catas:
        # Category names double as collection names; drop stray whitespace.
        cataname = cataname.strip()
        print(cataname + " 数量:" + str(MongoUtil.count(cataname)))
        locationCount += len(MongoUtil.distinct_count(cataname, "appid"))

    print("获取评论的app数量:" + str(locationCount), end="\n\n")
    print("word数量:" + str(MongoUtil.count("word_table")))
def showData(cataname):
    """Print database statistics for a single category.

    Args:
        cataname: Category name. It is used as given for the ``catagory``
            filter on ``app_table`` and whitespace-stripped when used as a
            collection name.

    Output, in order: the ``app_table`` and ``word_table`` document counts,
    the number of apps in the category, the document count of the category
    collection, the number of distinct ``appid`` values in that collection,
    and the difference between the category's app count and that number.

    Returns:
        None. All results are written to stdout.
    """
    total_apps = MongoUtil.count("app_table")
    print("总app数量:" + str(total_apps))

    total_words = MongoUtil.count("word_table")
    print("word数量:" + str(total_words))

    # The app_table filter deliberately uses the name exactly as passed in;
    # only the collection lookups below use the stripped form.
    category_filter = {"catagory": cataname}
    apps_in_category = MongoUtil.find("app_table", category_filter).count()
    print(cataname + "的 app数量: " + str(apps_in_category))

    collection_name = cataname.strip()
    print(collection_name + "的 location 数量:"
          + str(MongoUtil.count(collection_name)))

    # Number of distinct appids present in the category collection.
    distinct_appids = MongoUtil.distinct_count(collection_name, "appid")
    covered_apps = len(distinct_appids)
    print("已获取评论的 app数量:" + str(covered_apps))

    remaining_apps = apps_in_category - covered_apps
    print("未获取评论的 app数量:" + str(remaining_apps))
def tf_idf(self):
    """Compute, print and return a TF-IDF score for each word in ``self.worddict``.

    For every ``(word, frequency)`` pair in ``self.worddict``:

      * ``tf  = frequency / self.wordcount``
      * ``idf = log(docu_count / (include_count + 1))``

    ``docu_count`` is the number of distinct ``appid`` values in the
    collection named ``self.app["catagory"]``, minus one for this app.
    ``include_count`` is the same count restricted to entries whose
    ``wordid`` matches the word's ``_id`` in ``word_table``, again minus one.

    Returns:
        dict mapping each word to ``tf * idf``, or ``None`` (after printing
        a message) when ``self.worddict`` is empty/unset or
        ``self.wordcount`` is below 100.
    """
    if not self.worddict:
        print("请初始化词频统计")
        return None
    if self.wordcount < 100:
        print("该app的评论数量过少,获取关键词将会不准确")
        return None

    category = self.app["catagory"]

    # Total number of documents, minus this app itself.
    docu_count = len(
        MongoUtil.distinct_count(category, "appid", value=None)) - 1
    # Keep the numerator strictly positive: log(0) raises ValueError, which
    # previously happened whenever the category held no other app.
    docu_count = max(docu_count, 1)

    tf_idfdict = {}
    for word, frequency in self.worddict.items():
        result = MongoUtil.find_one("word_table", {"word": word})
        if result is None:
            # Word is not registered in word_table, so no other document
            # can be recorded as containing it.
            include_count = 0
        else:
            # Documents containing the word, minus this app itself.
            include_count = len(
                MongoUtil.distinct_count(
                    category, "appid", value={"wordid": result["_id"]})) - 1
            # Never let the denominator (include_count + 1) reach zero.
            include_count = max(include_count, 0)

        wordidf = math.log(docu_count / (include_count + 1))
        wordtf = float(frequency / self.wordcount)
        tf_idfdict[word] = wordtf * wordidf

    for word, score in tf_idfdict.items():
        print(word + " 出现的次数:" + str(self.worddict[word])
              + " tf-idf计算值:" + str(score))
    return tf_idfdict