def getDowloadCapacity(appname, cataname=""):
    """Collect the stored capacity values of one app, keyed by date.

    Args:
        appname: Value matched against the ``appname`` field of ``app_table``.
        cataname: Value matched against the ``catagory`` field. When it is
            the empty string the app is looked up by name alone.

    Returns:
        A dict mapping every ``date`` found in ``capacity_table`` for the app
        to its ``capacity``. A capacity whose ``isdigit()`` is true is kept
        as stored; any other value is passed through ``install2num``.
        ``None`` is returned (after printing a notice) when no app matches.
    """
    if cataname == "":
        query = {"appname": appname}
    else:
        query = {"catagory": cataname, "appname": appname}

    app = MongoUtil.find_one("app_table", query)
    if app is None:
        print(cataname + appname + "不存在")
        return None

    capacity_by_date = {}
    for record in MongoUtil.find("capacity_table", {"appid": app["_id"]}):
        day = record["date"]
        amount = record["capacity"]
        if not amount.isdigit():
            # Non-numeric text is converted by the project helper.
            amount = install2num(amount)
        capacity_by_date[day] = amount
    return capacity_by_date
def saveCommentEmotionData(model, best_words, app):
    """Classify every stored comment of an app and persist the tallies.

    The function sleeps for one second, then returns early (after printing a
    notice) if ``emotion_comment`` already holds a record for the app.
    Otherwise the word rows of the app are read from the collection named by
    ``app["catagory"]``, grouped into comments by their ``location`` value,
    and each comment is scored with ``predict`` and ``judgeCommentEmotion``.
    A judgement of 1 increments the positive tally and 2 the negative tally.
    The totals are handed to ``savetoDB``.

    Args:
        model: Classifier forwarded unchanged to ``predict``.
        best_words: Feature word collection forwarded to ``predict``.
        app: Mapping providing the ``_id``, ``appname`` and ``catagory`` keys.
    """
    time.sleep(1)
    appid = app["_id"]
    appname = app["appname"]
    cataname = app["catagory"]

    if MongoUtil.isExist("emotion_comment", {"appid": appid}):
        print(appname + "已经存在了")
        print()
        return

    rows = MongoUtil.find(cataname, {"appid": appid})
    print(cataname, appname)

    # Rebuild each comment as the list of its words, keyed by location.
    comments = {}
    for row in rows:
        word_id = row["wordid"]
        location = row["location"]
        word = MongoUtil.find_one("word_table", {"_id": word_id})["word"]
        comments.setdefault(location, []).append(word)

    pos_count = 0
    neg_count = 0
    for comment_words in comments.values():
        pred = predict(model, comment_words, best_words)
        emotion = judgeCommentEmotion(pred.prob('pos'), pred.prob('neg'))
        if emotion == 1:
            pos_count += 1
        elif emotion == 2:
            neg_count += 1

    savetoDB(appid, len(comments), pos_count, neg_count)
def scanMostFastGrownApps(order=-1, limit=50, capacity_limit=10000, date="2017-01-23"):
    """Print apps ranked by ``incre_rate`` for one date, skipping small apps.

    Rows are read from ``capacity_rate_table`` filtered on ``date`` and
    sorted on ``incre_rate``. For each row the matching ``app_table`` record
    is printed, extended with the row's ``incre_rate`` and
    ``wilson_lower_rate``, followed by an empty line.

    Args:
        order: Sort direction forwarded to ``MongoUtil.sort_with_values``.
        limit: Row budget. It is decremented for every row scanned and is
            only checked after a row has been printed, so skipped rows still
            consume budget.
        capacity_limit: Minimum ``capacity_num`` (from ``capacity_table`` on
            the same date) an app needs in order to be printed. Previously
            this argument was ignored in favour of a hard-coded 10000.
        date: Value matched against the ``date`` field of both tables.
    """
    results = MongoUtil.sort_with_values(
        "capacity_rate_table", {"date": date}, "incre_rate", order=order)
    for result in results:
        limit -= 1
        appid = result["appid"]

        appinfo = MongoUtil.find_one("app_table", {"_id": appid})
        if appinfo is None:
            # A rate row without an app record cannot be displayed.
            continue

        capacityinfo = MongoUtil.find_one(
            "capacity_table", {"appid": appid, "date": date})
        if capacityinfo is None or capacityinfo["capacity_num"] < capacity_limit:
            continue

        appinfo["incre_rate"] = result["incre_rate"]
        appinfo["wilson_lower_rate"] = result["wilson_lower_rate"]
        print(appinfo)
        print()
        if limit <= 0:
            break
def getRecommendInfo(appinfo, date):
    """Assemble the recommendation record of one app for a given date.

    Three lookups are required: ``capacity_table`` and
    ``capacity_rate_table`` (both by app id and ``date``) and
    ``emotion_comment`` (by app id). If any of them finds nothing the
    function returns ``None``.

    The ``recommend_score`` is the weighted sum of four normalised values
    (capacity rate, capacity, comment Wilson lower score, comment count)
    plus ``correct(recommend_info)``. The normalisation helpers and the
    ``*_param`` weights are names defined outside this function.

    Args:
        appinfo: Mapping providing ``_id``, ``appname`` and ``catagory``.
        date: Value matched against the ``date`` field of the capacity tables.

    Returns:
        The record dict, or ``None`` when a lookup fails. If building the
        record raises, a ``-->`` line is printed and the partially filled
        dict is returned (best-effort behaviour kept from the original).
    """
    app_id = appinfo["_id"]

    capacity_info = MongoUtil.find_one(
        "capacity_table", {"appid": app_id, "date": date})
    if capacity_info is None:
        return None

    capacity_rate_info = MongoUtil.find_one(
        "capacity_rate_table", {"appid": app_id, "date": date})
    if capacity_rate_info is None:
        return None

    comment_info = MongoUtil.find_one("emotion_comment", {"appid": app_id})
    if comment_info is None:
        return None

    recommend_info = {}
    try:
        recommend_info["appname"] = appinfo["appname"]
        recommend_info["catagory"] = appinfo["catagory"]
        recommend_info["appid"] = app_id
        recommend_info["capacity"] = capacity_info["capacity_num"]
        recommend_info["date"] = date
        recommend_info["capacity_rate"] = capacity_rate_info["incre_rate"]
        recommend_info["comment_wilson_lower_score"] = comment_info[
            "wilson_lower_score"]
        recommend_info["comment_count"] = comment_info["comment_count"]
        recommend_info["recommend_score"] = (
            getLastCapacityNormalization(recommend_info["capacity_rate"]) *
            last_capacity_rate_param +
            getCapacityNormalization(recommend_info["capacity"]) *
            capacity_param +
            getApplauseNormalization(
                recommend_info["comment_wilson_lower_score"]) *
            applause_param +
            getCommentCountNormalization(recommend_info["comment_count"]) *
            comment_count_param +
            correct(recommend_info))
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate. Fall back to the app id when the
        # name itself could not be read, so the handler cannot raise.
        print("-->" + str(recommend_info.get("appname", app_id)))
    return recommend_info
def scanMostCapacityApps(limit=50):
    """Print the apps returned by ``MongoUtil.capacity_find_most``.

    Each returned row carries a ``value`` sub-document holding ``appid``,
    ``capacity_num`` and ``date``. The matching ``app_table`` record is
    extended with the latter two fields and printed, followed by an empty
    line.

    Args:
        limit: Forwarded unchanged to ``MongoUtil.capacity_find_most``.
    """
    for row in MongoUtil.capacity_find_most(limit):
        record = row["value"]
        appinfo = MongoUtil.find_one("app_table", {"_id": record["appid"]})
        for field in ("capacity_num", "date"):
            appinfo[field] = record[field]
        print(appinfo)
        print()
def deliveryWords(appinfo, filename):
    """Segment a comment file and store each kept word with its line number.

    The app is looked up in ``app_table`` by ``appinfo.cata`` and
    ``appinfo.name``. Nothing is written when the app is unknown, or when
    the collection named ``appinfo.cata`` already holds a row for it. Every
    line is stripped, cleaned of control characters and cut with
    ``jieba.cut(..., cut_all=False)``. Words that are stop words,
    punctuation, a newline, a single space or all digits are dropped. Each
    remaining word is ensured to exist in ``word_table`` and a row
    ``{appid, wordid, location}`` is inserted into the category collection,
    where ``location`` is the 1-based line number.

    Args:
        appinfo: Object exposing ``name`` and ``cata`` attributes.
        filename: Path of the text file, one comment per line (opened with
            the platform default encoding, as before).
    """
    print(appinfo.name)
    # Read everything up front and close the handle deterministically.
    with open(filename) as comment_file:
        contents = [line.strip() for line in comment_file]

    app = MongoUtil.find_one(
        "app_table", {"catagory": appinfo.cata, "appname": appinfo.name})
    if app is None:
        print("\""+appinfo.cata+" "+appinfo.name+"\" 未存入数据库中,请先存储")
        return
    appid = app['_id']

    if MongoUtil.find_one(appinfo.cata, {"appid": appid}) is not None:
        print("\""+appinfo.cata+" "+appinfo.name+"\" 已经分词存入数据库,不必重复")
        return

    # Strips control characters; compiled once instead of once per line.
    control_chars = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]')

    for line_num, line in enumerate(contents, start=1):
        time.sleep(0.1)
        line = control_chars.sub(' ', line)
        # cut_all=False selects jieba's accurate mode, not full mode.
        for word in jieba.cut(line, cut_all=False):
            if (word in stopWords or word in punctuations or word == '\n'
                    or word == ' ' or word.isdigit()):
                continue
            post_word = {"word": word}
            if not MongoUtil.isExist("word_table", post_word):
                MongoUtil.insert("word_table", post_word)
            wordid = MongoUtil.find_one("word_table", post_word)['_id']
            if wordid is None:
                print(post_word)
            MongoUtil.insert(
                appinfo.cata,
                {"appid": appid, "wordid": wordid, "location": line_num})
def __init__(self, appname, cataname=""):
    """Load an app record and precompute its word statistics.

    Args:
        appname: Value matched against the ``appname`` field of ``app_table``.
        cataname: Value matched against the ``catagory`` field. When it is
            the empty string the app is looked up by name alone.

    After construction ``self.app`` holds the record (or ``None``),
    ``self.worddict`` / ``self.wordcount`` hold the result of
    ``self.frequencyscore()``, and ``self.tf_idfdict`` holds the result of
    ``self.tf_idf()``, or ``None`` when fewer than 100 words were counted.
    When the app is not found the statistics stay at their empty defaults
    instead of the constructor failing on a ``None`` record.
    """
    self.tf_idfdict = None
    # Defaults for the "app not found" path; tf_idf() already treats a
    # missing worddict as "not initialised".
    self.worddict = None
    self.wordcount = 0

    if cataname == "":
        query = {"appname": appname}
    else:
        query = {"catagory": cataname, "appname": appname}
    self.app = MongoUtil.find_one("app_table", query)

    if self.app is None:
        print("该app未存储在数据库,可能原因:查询不准确,未存储入数据库,数据未更新")
        print(self.app)
        # frequencyscore() subscripts self.app, so stop here.
        return
    print(self.app)

    self.worddict, self.wordcount = self.frequencyscore()
    if self.wordcount < 100:
        print("该app的评论数量过少,获取关键词将会不准确")
        return
    print("评论总数是:" + str(self.wordcount))
    self.tf_idfdict = self.tf_idf()
def get_app_each_comment(appname, cataname=""):
    """Rebuild the comments of an app as lists of words.

    Args:
        appname: Value matched against the ``appname`` field of ``app_table``.
        cataname: Value matched against the ``catagory`` field. When it is
            the empty string the app is looked up by name alone.

    Returns:
        A dict mapping each ``location`` value to the list of words stored
        for it, in the order the rows are returned. ``None`` when no app
        matches. The looked-up record is printed in either case.
    """
    if cataname == "":
        criteria = {"appname": appname}
    else:
        criteria = {"catagory": cataname, "appname": appname}

    app = MongoUtil.find_one("app_table", criteria)
    print(app)
    if app is None:
        return None

    app_id = app["_id"]
    app_cata = app["catagory"]

    grouped = {}
    for row in MongoUtil.find(app_cata, {"appid": app_id}):
        word_id = row["wordid"]
        location = row["location"]
        word = MongoUtil.find_one("word_table", {"_id": word_id})["word"]
        grouped.setdefault(location, []).append(word)
    return grouped
def frequencyscore(self):
    """Count how often each word occurs in the app's stored rows.

    Rows are read from the collection named by ``self.app["catagory"]``,
    filtered on the app id. Every row's ``wordid`` is resolved through
    ``word_table``.

    Returns:
        A ``(frequencies, total)`` tuple: a dict mapping each word to its
        number of occurrences, and the total number of rows processed.
    """
    frequencies = {}
    total = 0
    rows = MongoUtil.find(self.app["catagory"], {"appid": self.app["_id"]})
    for total, row in enumerate(rows, start=1):
        entry = MongoUtil.find_one("word_table", {"_id": row["wordid"]})
        term = entry["word"]
        frequencies[term] = frequencies.get(term, 0) + 1
    return frequencies, total
def scanMostPositiveApps(order=-1, limit=50):
    """Print apps ordered by the ``wilson_lower_score`` of their comments.

    Rows come from ``emotion_comment`` via ``MongoUtil.sort``. For each row
    the matching ``app_table`` record is extended with the row's comment
    statistics and printed, followed by an empty line.

    Args:
        order: Sort direction forwarded to ``MongoUtil.sort``.
        limit: Row limit forwarded to ``MongoUtil.sort``.
    """
    copied_fields = (
        "comment_count",
        "pos_count",
        "neg_count",
        "applause_rate",
        "wilson_lower_score",
    )
    ranked = MongoUtil.sort(
        "emotion_comment", "wilson_lower_score", order=order, limit=limit)
    for row in ranked:
        appinfo = MongoUtil.find_one("app_table", {"_id": row["appid"]})
        for field in copied_fields:
            appinfo[field] = row[field]
        print(appinfo)
        print()
def tf_idf(self):
    """Compute a TF-IDF score for every word counted for this app.

    The corpus is the set of distinct ``appid`` values in the collection
    named by ``self.app["catagory"]``. The app itself is excluded from both
    the document total and the per-word document count. For each word:

        tf  = occurrences / self.wordcount
        idf = log(other_docs / (other_docs_containing_word + 1))

    When that ratio is not positive (for example when the category holds no
    other app) the logarithm is undefined, so the IDF is taken as 0.0
    instead of raising ``ValueError``.

    Returns:
        A dict mapping each word to its score, or ``None`` (after printing a
        notice) when ``self.worddict`` is empty/unset or fewer than 100
        words were counted. Every score is also printed.
    """
    if self.worddict is None or len(self.worddict) == 0:
        print("请初始化词频统计")
        return None
    if self.wordcount < 100:
        print("该app的评论数量过少,获取关键词将会不准确")
        return None

    category = self.app["catagory"]

    # Total number of documents, minus this app itself; never negative.
    docu_count = len(MongoUtil.distinct_count(category, "appid", value=None)) - 1
    if docu_count < 0:
        docu_count = 0

    tf_idfdict = {}
    for word, occurrences in self.worddict.items():
        wordid = MongoUtil.find_one("word_table", {"word": word})["_id"]
        # Documents containing the word, minus this app itself.
        include_count = len(
            MongoUtil.distinct_count(category, "appid",
                                     value={"wordid": wordid})) - 1
        denominator = include_count + 1
        if docu_count > 0 and denominator > 0:
            wordidf = float(math.log(docu_count / denominator))
        else:
            # log() is undefined here; treat the word as carrying no
            # distinguishing information.
            wordidf = 0.0
        wordtf = float(occurrences / self.wordcount)
        tf_idfdict[word] = wordtf * wordidf

    for word, score in tf_idfdict.items():
        print(word + " 出现的次数:" + str(self.worddict[word]) +
              " tf-idf计算值:" + str(score))
    return tf_idfdict
def delivery_words(appid, content):
    """Segment one piece of text and queue a row for every kept word.

    Control characters in ``content`` are replaced by spaces, then the text
    is cut with ``jieba.cut(..., cut_all=False)`` (jieba's accurate mode).
    Stop words, punctuation, a newline, a single space and all-digit tokens
    are dropped. Each remaining word is ensured to exist in ``word_table``
    and ``{"appid", "wordid"}`` is appended to the ``posts`` list defined
    outside this function.

    Args:
        appid: Identifier stored in every queued row.
        content: Text to segment.
    """
    cleaned = re.sub('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]', ' ', content)
    for word in jieba.cut(cleaned, cut_all=False):
        rejected = (word in stopWords or word in punctuations
                    or word == '\n' or word == ' ' or word.isdigit())
        if rejected:
            continue
        post_word = {"word": word}
        if not MongoUtil.isExist("word_table", post_word):
            MongoUtil.insert("word_table", post_word)
        stored = MongoUtil.find_one("word_table", post_word)
        wordid = stored['_id']
        if wordid is None:
            print(post_word)
        posts.append({"appid": appid, "wordid": wordid})
def getCataAppsInfo(filename):
    """Store the install capacity of every app listed under a data directory.

    The date is taken from ``readInfoFile(filename)``. The category names
    are loaded from ``const.WANDOUJIA_CATA_JSON_FILE``; for each one the file
    ``<filename>/<category>.json`` is loaded and iterated as a mapping of app
    name to info. Apps found in ``app_table`` have their ``install`` value
    saved through ``saveAppCapacityToDB``; the others are reported and
    appended to the ``app_not_exist`` list defined outside this function.

    Args:
        filename: Directory path; also passed to ``readInfoFile``.
    """
    print(filename)
    # The second element returned by readInfoFile is not needed here.
    date, _count = readInfoFile(filename)

    # Context managers close the JSON files instead of leaking the handles.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)

    for cataname in catas:
        cataname = cataname.strip()
        catafilename = filename + "/" + cataname + ".json"
        with open(catafilename) as apps_file:
            apps = json.load(apps_file)

        for app in apps.items():
            raw_name, info = app
            name = raw_name.strip().replace("/", " ")
            time.sleep(0.2)
            post = {"catagory": cataname, "appname": name}
            result = MongoUtil.find_one("app_table", post)
            print("存入的app" + name)
            if result is None:
                print(cataname + "->" + name + " ->该app未存入数据库")
                app_not_exist.append(app)
            else:
                saveAppCapacityToDB(result["_id"], date, info["install"])
def deleteAppDieveryWord(cataname, appname):
    """Remove the stored word rows of one app from its category collection.

    The app is looked up in ``app_table`` by both ``catagory`` and
    ``appname``, so that an app with the same name in another category is
    not picked by mistake. Its rows are then removed from the collection
    named ``cataname``.

    Args:
        cataname: Category name; also the name of the collection to clean.
        appname: Value matched against the ``appname`` field.

    When no app matches, a notice is printed and nothing is removed.
    """
    app = MongoUtil.find_one(
        "app_table", {"catagory": cataname, "appname": appname})
    if app is None:
        print(cataname + appname + "不存在")
        return
    MongoUtil.remove(cataname, {"appid": app["_id"]})
    print("已从“"+cataname+"”数据库中删除“"+appname+"”应用的分词信息")