import os
import re
import stat
import datetime
from os.path import dirname

import jieba
import jieba.posseg
from django.core.cache import cache

# Project-level imports (News, Vocabulary, HotWords, HotWordTrace,
# RANK_SORT_PARAMETER, getDBConfigure, setDBConfigure) come from elsewhere
# in this project and are omitted here.


def build_user_dict():
    """ Create a fresh user-defined dictionary file and return its path. """
    DIR = dirname(os.path.abspath(__file__))
    # 0666: read/write for owner, group and others
    filePermission = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP \
        | stat.S_IROTH | stat.S_IWOTH
    userDictFile = os.path.join(DIR, "userdict.txt")

    useUserDefinedDict = getDBConfigure("USE_USER_DEFINED_DICT", default=0,
                                        type_=lambda x: bool(int(x)))
    if not useUserDefinedDict:
        # The user dictionary is disabled: fall back to an empty placeholder.
        nullUserDictFile = os.path.join(DIR, "nulluserdict.txt")
        if not os.path.exists(nullUserDictFile):
            with open(nullUserDictFile, "w+") as f:
                print "created nulluserdict.txt"
            os.chmod(nullUserDictFile, filePermission)
        print "USE_USER_DEFINED_DICT=OFF, using the null dict instead"
        return nullUserDictFile

    needRebuildUserDefinedDict = getDBConfigure("RE_BUILD_USER_DEFINED_DICT", default=1,
                                                type_=lambda x: bool(int(x)))
    if not needRebuildUserDefinedDict:
        if not os.path.exists(userDictFile):
            with open(userDictFile, "w+") as f:
                print "File userdict.txt does not exist, creating an empty one"
            os.chmod(userDictFile, filePermission)
        print "using the existing userdict.txt file"
        return userDictFile

    ##
    # Rebuild the dictionary, then reset the flag so it is not rebuilt again.
    ##
    setDBConfigure("RE_BUILD_USER_DEFINED_DICT", option=0)
    if os.path.exists(userDictFile):
        os.remove(userDictFile)

    template_line = u"{word} {frequency} {characteristic}\n"
    with open(userDictFile, 'w') as f:
        wordCount = Vocabulary.objects.filter(brand="user").count()
        step = 300  # write the vocabulary in batches of 300 rows
        written = 0
        while written < wordCount:
            words = Vocabulary.objects.filter(brand="user")\
                .values("word", "frequency", "characteristic")\
                .all()[written:written + step]
            for word in words:
                f.write(template_line.format(**word).encode("utf-8"))
            written = written + step
    os.chmod(userDictFile, filePermission)
    print "rebuilt userdict.txt"
    return userDictFile
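# build_user_dict() emits lines in jieba's user-dictionary format
# ("word frequency pos-tag"), so its return value can be fed straight to
# jieba's public loader, which accepts a file path:
#
#   jieba.load_userdict(build_user_dict())
#
# The getDBConfigure/setDBConfigure helpers used throughout this module are
# defined elsewhere in the project. The sketch below only illustrates the
# contract they appear to follow; the `Configure` model with `option`/`value`
# fields is an assumption, not the project's actual schema.
def getDBConfigure(option, default=None, type_=str):
    """ Read one setting from the database, coerced by type_, else the default. """
    try:
        return type_(Configure.objects.get(option=option).value)
    except Configure.DoesNotExist:
        return type_(default)


def setDBConfigure(name, option=None):
    """ Persist a setting back to the database; values are stored as text. """
    row, _ = Configure.objects.get_or_create(option=name)
    row.value = str(option)
    row.save()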
def newsvolume():
    """ Return the total number of news items, cached for a configurable lifetime. """
    if not cache.get("news_amount"):
        news_amount = News.objects.all().count()
        newsVolumeCountCacheLife = getDBConfigure("NEWS_COUNT_CACHE_LIFE",
                                                  default=4 * 60 * 40, type_=int)
        cache.set('news_amount', news_amount, newsVolumeCountCacheLife)
    return cache.get("news_amount")
def hotwords():
    """ Hot-word list wrapped for a template context, cached in Django's cache. """
    if not cache.get("hot_words_list"):
        ##
        # Read the cache lifetime from the database, e.g.
        #   RECOMM_CACHE_LIFE = 30
        ##
        RECOMM_CACHE_LIFE = getDBConfigure("RECOMM_CACHE_LIFE", default="30", type_=int)
        hot_words_list = HotWords.appear_most()
        cache.set("hot_words_list", hot_words_list, RECOMM_CACHE_LIFE)
    return {'hotwords': cache.get("hot_words_list")}
def recommend_words():
    """ Same as hotwords(), but returns the bare list instead of a context dict. """
    if not cache.get("hot_words_list"):
        RECOMM_CACHE_LIFE = getDBConfigure("RECOMM_CACHE_LIFE", default="30", type_=int)
        hot_words_list = HotWords.appear_most()
        cache.set("hot_words_list", hot_words_list, RECOMM_CACHE_LIFE)
    return cache.get("hot_words_list")
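# Usage sketch: hotwords() is shaped for a template context while
# recommend_words() returns the bare list; both share the "hot_words_list"
# cache key, so at most one recomputation happens per RECOMM_CACHE_LIFE
# window. The view and template names below are hypothetical, not part of
# this project.
from django.shortcuts import render


def search_page(request):
    # {'hotwords': [...]} drops straight into the template context
    return render(request, "search.html", hotwords())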
@classmethod
def appear_most(cls):
    """ Find the keywords that appear most often in recent news titles. """
    cls.init()
    ##
    # Tuning parameters read from the database, e.g.
    #   RECOMM_DAYS = 2
    #   RECOMM_RANK_GT = 200
    #   RECOMM_NEWSLIMIT = RECOMM_DAYS * RECOMM_RANK_GT
    #   RECOMM_HALF_DESC = 30
    #   RECOMM_NUM = 8
    # These settings are deliberately not cached; they are re-read from the
    # database on every call.
    ##
    RECOMM_DAYS = getDBConfigure("RECOMM_DAYS", default=2, type_=int)
    RECOMM_RANK_GT = getDBConfigure("RECOMM_RANK_GT", default=200, type_=int)
    RECOMM_NEWSLIMIT = getDBConfigure("RECOMM_NEWSLIMIT", default=400, type_=int)
    RECOMM_HALF_DESC = getDBConfigure("RECOMM_HALF_DESC", default="30.0", type_=float)
    RECOMM_NUM = getDBConfigure("RECOMM_NUM", default=8, type_=int)
    RECOMM_RECORD_HOT = getDBConfigure("RECOMM_RECORD_HOT", default=0,
                                       type_=lambda v: bool(int(v)))
    RECOMM_ALLOWED_WORD_TYPE_REGEX = getDBConfigure("RECOMM_ALLOWED_WORD_TYPE_REGEX",
                                                    default="(nr|nz|nt)", type_=str)
    # Whitelist regex for the allowed part-of-speech tags
    cls.regex_allowed = re.compile(RECOMM_ALLOWED_WORD_TYPE_REGEX)
    ############ END SETTINGS FROM DATABASE #################

    today = datetime.date.today()
    oneday = datetime.timedelta(days=RECOMM_DAYS)
    yesterday = today - oneday
    recent_news = News.objects.filter(news_time__gte=yesterday,
                                      news_time__lte=today,
                                      rank__gt=(RANK_SORT_PARAMETER - RECOMM_RANK_GT))\
        .only("title")\
        .order_by("-news_time", "-rank")[0:RECOMM_NEWSLIMIT]

    tagMap = dict()
    for news in recent_news.values("id", "title", "rank"):
        # Segment the title and keep only plausible keyword candidates
        tags = jieba.posseg.cut(news["title"])
        tags = filter(cls.filter_out_short, tags)
        tags = filter(cls.filter_out_number, tags)
        tags = filter(cls.filter_out_deny, tags)
        tags = filter(cls.filter_in_only, tags)
        for tag in tags:
            # Higher-ranked news contributes more; RECOMM_HALF_DESC controls
            # how quickly the contribution decays as the rank drops.
            score = RECOMM_HALF_DESC / (RANK_SORT_PARAMETER - news["rank"] + int(RECOMM_HALF_DESC))
            key = tag.word
            if key in tagMap:
                tagMap[key] = score + tagMap[key]
            else:
                tagMap[key] = score

    hot_word_tube = sorted(tagMap.items(), key=lambda x: x[1], reverse=True)
    hot_words = [x[0] for x in hot_word_tube[0:RECOMM_NUM]]

    # Record today's hot words only when tracing is switched on
    if RECOMM_RECORD_HOT:
        for i, w in enumerate(hot_word_tube[0:RECOMM_NUM]):
            try:
                hotword = HotWordTrace.objects.only("word")\
                    .get(time=datetime.date.today(), word=w[0])
            except HotWordTrace.DoesNotExist:
                note = "using %s/(1+%s)" % (RECOMM_HALF_DESC, RECOMM_HALF_DESC)  # for debugging purposes
                hotword = HotWordTrace(word=w[0], rank=i + 1, score=w[1], note=note)
                hotword.save()
            except HotWordTrace.MultipleObjectsReturned:
                # Deduplicate: keep the first record and delete the rest
                hotword = HotWordTrace.objects.only("id")\
                    .filter(time=datetime.date.today(), word=w[0])
                theSameId = [x[0] for x in hotword.values_list("id")[1:]]
                HotWordTrace.objects.filter(id__in=theSameId).delete()
    return hot_words
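# A worked example of the rank-to-score curve used in appear_most(). The
# constants below are illustrative assumptions (RANK_SORT_PARAMETER really
# comes from project settings). A title at the top rank scores 1.0, and the
# score halves once an item falls RECOMM_HALF_DESC rank points behind the
# top, so RECOMM_HALF_DESC acts as a half-decay distance.
def _score_curve_demo():
    """ Illustrative only: show how RECOMM_HALF_DESC shapes the decay. """
    RANK_SORT_PARAMETER = 10000  # assumed value of the best possible rank
    RECOMM_HALF_DESC = 30.0      # assumed half-decay distance
    for rank in (10000, 9985, 9970, 9940):
        score = RECOMM_HALF_DESC / (RANK_SORT_PARAMETER - rank + RECOMM_HALF_DESC)
        print "rank=%d -> score=%.2f" % (rank, score)
    # prints: 1.00, 0.67, 0.50 (half-decay at 30 points behind), 0.33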